cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

vmalloc.c (109181B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  Copyright (C) 1993  Linus Torvalds
      4 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
      5 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
      6 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
      7 *  Numa awareness, Christoph Lameter, SGI, June 2005
      8 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
      9 */
     10
     11#include <linux/vmalloc.h>
     12#include <linux/mm.h>
     13#include <linux/module.h>
     14#include <linux/highmem.h>
     15#include <linux/sched/signal.h>
     16#include <linux/slab.h>
     17#include <linux/spinlock.h>
     18#include <linux/interrupt.h>
     19#include <linux/proc_fs.h>
     20#include <linux/seq_file.h>
     21#include <linux/set_memory.h>
     22#include <linux/debugobjects.h>
     23#include <linux/kallsyms.h>
     24#include <linux/list.h>
     25#include <linux/notifier.h>
     26#include <linux/rbtree.h>
     27#include <linux/xarray.h>
     28#include <linux/io.h>
     29#include <linux/rcupdate.h>
     30#include <linux/pfn.h>
     31#include <linux/kmemleak.h>
     32#include <linux/atomic.h>
     33#include <linux/compiler.h>
     34#include <linux/memcontrol.h>
     35#include <linux/llist.h>
     36#include <linux/bitops.h>
     37#include <linux/rbtree_augmented.h>
     38#include <linux/overflow.h>
     39#include <linux/pgtable.h>
     40#include <linux/uaccess.h>
     41#include <linux/hugetlb.h>
     42#include <linux/sched/mm.h>
     43#include <asm/tlbflush.h>
     44#include <asm/shmparam.h>
     45
     46#include "internal.h"
     47#include "pgalloc-track.h"
     48
     49#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
     50static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
     51
     52static int __init set_nohugeiomap(char *str)
     53{
     54	ioremap_max_page_shift = PAGE_SHIFT;
     55	return 0;
     56}
     57early_param("nohugeiomap", set_nohugeiomap);
     58#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
     59static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
     60#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
     61
     62#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
     63static bool __ro_after_init vmap_allow_huge = true;
     64
     65static int __init set_nohugevmalloc(char *str)
     66{
     67	vmap_allow_huge = false;
     68	return 0;
     69}
     70early_param("nohugevmalloc", set_nohugevmalloc);
     71#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
     72static const bool vmap_allow_huge = false;
     73#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
     74
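/*
 * Note that this check covers only the core [VMALLOC_START, VMALLOC_END)
 * range. Module mappings that live outside of it are handled separately
 * by is_vmalloc_or_module_addr() further below.
 */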
     75bool is_vmalloc_addr(const void *x)
     76{
     77	unsigned long addr = (unsigned long)kasan_reset_tag(x);
     78
     79	return addr >= VMALLOC_START && addr < VMALLOC_END;
     80}
     81EXPORT_SYMBOL(is_vmalloc_addr);
     82
     83struct vfree_deferred {
     84	struct llist_head list;
     85	struct work_struct wq;
     86};
     87static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
     88
     89static void __vunmap(const void *, int);
     90
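/*
 * The per-CPU vfree_deferred list collects pointers freed from contexts
 * that must not sleep; this work handler later performs the real
 * __vunmap() from process context.
 */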
     91static void free_work(struct work_struct *w)
     92{
     93	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
     94	struct llist_node *t, *llnode;
     95
     96	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
     97		__vunmap((void *)llnode, 1);
     98}
     99
    100/*** Page table manipulation functions ***/
    101static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
    102			phys_addr_t phys_addr, pgprot_t prot,
    103			unsigned int max_page_shift, pgtbl_mod_mask *mask)
    104{
    105	pte_t *pte;
    106	u64 pfn;
    107	unsigned long size = PAGE_SIZE;
    108
    109	pfn = phys_addr >> PAGE_SHIFT;
    110	pte = pte_alloc_kernel_track(pmd, addr, mask);
    111	if (!pte)
    112		return -ENOMEM;
    113	do {
    114		BUG_ON(!pte_none(*pte));
    115
    116#ifdef CONFIG_HUGETLB_PAGE
    117		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
    118		if (size != PAGE_SIZE) {
    119			pte_t entry = pfn_pte(pfn, prot);
    120
    121			entry = arch_make_huge_pte(entry, ilog2(size), 0);
    122			set_huge_pte_at(&init_mm, addr, pte, entry);
    123			pfn += PFN_DOWN(size);
    124			continue;
    125		}
    126#endif
    127		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
    128		pfn++;
    129	} while (pte += PFN_DOWN(size), addr += size, addr != end);
    130	*mask |= PGTBL_PTE_MODIFIED;
    131	return 0;
    132}
    133
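/*
 * The vmap_try_huge_{pmd,pud,p4d} helpers below return 0 (i.e. fall back
 * to the next smaller mapping size) unless the architecture supports a
 * huge mapping for this protection, the range covers exactly one huge
 * entry, and both the virtual and physical addresses are suitably aligned.
 */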
    134static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
    135			phys_addr_t phys_addr, pgprot_t prot,
    136			unsigned int max_page_shift)
    137{
    138	if (max_page_shift < PMD_SHIFT)
    139		return 0;
    140
    141	if (!arch_vmap_pmd_supported(prot))
    142		return 0;
    143
    144	if ((end - addr) != PMD_SIZE)
    145		return 0;
    146
    147	if (!IS_ALIGNED(addr, PMD_SIZE))
    148		return 0;
    149
    150	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
    151		return 0;
    152
    153	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
    154		return 0;
    155
    156	return pmd_set_huge(pmd, phys_addr, prot);
    157}
    158
    159static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
    160			phys_addr_t phys_addr, pgprot_t prot,
    161			unsigned int max_page_shift, pgtbl_mod_mask *mask)
    162{
    163	pmd_t *pmd;
    164	unsigned long next;
    165
    166	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
    167	if (!pmd)
    168		return -ENOMEM;
    169	do {
    170		next = pmd_addr_end(addr, end);
    171
    172		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
    173					max_page_shift)) {
    174			*mask |= PGTBL_PMD_MODIFIED;
    175			continue;
    176		}
    177
    178		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
    179			return -ENOMEM;
    180	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
    181	return 0;
    182}
    183
    184static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
    185			phys_addr_t phys_addr, pgprot_t prot,
    186			unsigned int max_page_shift)
    187{
    188	if (max_page_shift < PUD_SHIFT)
    189		return 0;
    190
    191	if (!arch_vmap_pud_supported(prot))
    192		return 0;
    193
    194	if ((end - addr) != PUD_SIZE)
    195		return 0;
    196
    197	if (!IS_ALIGNED(addr, PUD_SIZE))
    198		return 0;
    199
    200	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
    201		return 0;
    202
    203	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
    204		return 0;
    205
    206	return pud_set_huge(pud, phys_addr, prot);
    207}
    208
    209static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
    210			phys_addr_t phys_addr, pgprot_t prot,
    211			unsigned int max_page_shift, pgtbl_mod_mask *mask)
    212{
    213	pud_t *pud;
    214	unsigned long next;
    215
    216	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
    217	if (!pud)
    218		return -ENOMEM;
    219	do {
    220		next = pud_addr_end(addr, end);
    221
    222		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
    223					max_page_shift)) {
    224			*mask |= PGTBL_PUD_MODIFIED;
    225			continue;
    226		}
    227
    228		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
    229					max_page_shift, mask))
    230			return -ENOMEM;
    231	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
    232	return 0;
    233}
    234
    235static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
    236			phys_addr_t phys_addr, pgprot_t prot,
    237			unsigned int max_page_shift)
    238{
    239	if (max_page_shift < P4D_SHIFT)
    240		return 0;
    241
    242	if (!arch_vmap_p4d_supported(prot))
    243		return 0;
    244
    245	if ((end - addr) != P4D_SIZE)
    246		return 0;
    247
    248	if (!IS_ALIGNED(addr, P4D_SIZE))
    249		return 0;
    250
    251	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
    252		return 0;
    253
    254	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
    255		return 0;
    256
    257	return p4d_set_huge(p4d, phys_addr, prot);
    258}
    259
    260static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
    261			phys_addr_t phys_addr, pgprot_t prot,
    262			unsigned int max_page_shift, pgtbl_mod_mask *mask)
    263{
    264	p4d_t *p4d;
    265	unsigned long next;
    266
    267	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
    268	if (!p4d)
    269		return -ENOMEM;
    270	do {
    271		next = p4d_addr_end(addr, end);
    272
    273		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
    274					max_page_shift)) {
    275			*mask |= PGTBL_P4D_MODIFIED;
    276			continue;
    277		}
    278
    279		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
    280					max_page_shift, mask))
    281			return -ENOMEM;
    282	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
    283	return 0;
    284}
    285
    286static int vmap_range_noflush(unsigned long addr, unsigned long end,
    287			phys_addr_t phys_addr, pgprot_t prot,
    288			unsigned int max_page_shift)
    289{
    290	pgd_t *pgd;
    291	unsigned long start;
    292	unsigned long next;
    293	int err;
    294	pgtbl_mod_mask mask = 0;
    295
    296	might_sleep();
    297	BUG_ON(addr >= end);
    298
    299	start = addr;
    300	pgd = pgd_offset_k(addr);
    301	do {
    302		next = pgd_addr_end(addr, end);
    303		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
    304					max_page_shift, &mask);
    305		if (err)
    306			break;
    307	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
    308
    309	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
    310		arch_sync_kernel_mappings(start, end);
    311
    312	return err;
    313}
    314
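/*
 * Note: the protection bits are passed through pgprot_nx(), which clears
 * the executable permission where the architecture defines it, and the
 * cache flush is issued unconditionally once the mapping has been
 * attempted.
 */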
    315int ioremap_page_range(unsigned long addr, unsigned long end,
    316		phys_addr_t phys_addr, pgprot_t prot)
    317{
    318	int err;
    319
    320	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
    321				 ioremap_max_page_shift);
    322	flush_cache_vmap(addr, end);
    323	return err;
    324}
    325
    326static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
    327			     pgtbl_mod_mask *mask)
    328{
    329	pte_t *pte;
    330
    331	pte = pte_offset_kernel(pmd, addr);
    332	do {
    333		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
    334		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
    335	} while (pte++, addr += PAGE_SIZE, addr != end);
    336	*mask |= PGTBL_PTE_MODIFIED;
    337}
    338
    339static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
    340			     pgtbl_mod_mask *mask)
    341{
    342	pmd_t *pmd;
    343	unsigned long next;
    344	int cleared;
    345
    346	pmd = pmd_offset(pud, addr);
    347	do {
    348		next = pmd_addr_end(addr, end);
    349
    350		cleared = pmd_clear_huge(pmd);
    351		if (cleared || pmd_bad(*pmd))
    352			*mask |= PGTBL_PMD_MODIFIED;
    353
    354		if (cleared)
    355			continue;
    356		if (pmd_none_or_clear_bad(pmd))
    357			continue;
    358		vunmap_pte_range(pmd, addr, next, mask);
    359
    360		cond_resched();
    361	} while (pmd++, addr = next, addr != end);
    362}
    363
    364static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
    365			     pgtbl_mod_mask *mask)
    366{
    367	pud_t *pud;
    368	unsigned long next;
    369	int cleared;
    370
    371	pud = pud_offset(p4d, addr);
    372	do {
    373		next = pud_addr_end(addr, end);
    374
    375		cleared = pud_clear_huge(pud);
    376		if (cleared || pud_bad(*pud))
    377			*mask |= PGTBL_PUD_MODIFIED;
    378
    379		if (cleared)
    380			continue;
    381		if (pud_none_or_clear_bad(pud))
    382			continue;
    383		vunmap_pmd_range(pud, addr, next, mask);
    384	} while (pud++, addr = next, addr != end);
    385}
    386
    387static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
    388			     pgtbl_mod_mask *mask)
    389{
    390	p4d_t *p4d;
    391	unsigned long next;
    392
    393	p4d = p4d_offset(pgd, addr);
    394	do {
    395		next = p4d_addr_end(addr, end);
    396
    397		p4d_clear_huge(p4d);
    398		if (p4d_bad(*p4d))
    399			*mask |= PGTBL_P4D_MODIFIED;
    400
    401		if (p4d_none_or_clear_bad(p4d))
    402			continue;
    403		vunmap_pud_range(p4d, addr, next, mask);
    404	} while (p4d++, addr = next, addr != end);
    405}
    406
    407/*
    408 * vunmap_range_noflush is similar to vunmap_range, but does not
    409 * flush caches or TLBs.
    410 *
     411 * The caller is responsible for calling flush_cache_vunmap() before calling
     412 * this function, and flush_tlb_kernel_range() after it has returned
    413 * successfully (and before the addresses are expected to cause a page fault
    414 * or be re-mapped for something else, if TLB flushes are being delayed or
    415 * coalesced).
    416 *
    417 * This is an internal function only. Do not use outside mm/.
    418 */
    419void vunmap_range_noflush(unsigned long start, unsigned long end)
    420{
    421	unsigned long next;
    422	pgd_t *pgd;
    423	unsigned long addr = start;
    424	pgtbl_mod_mask mask = 0;
    425
    426	BUG_ON(addr >= end);
    427	pgd = pgd_offset_k(addr);
    428	do {
    429		next = pgd_addr_end(addr, end);
    430		if (pgd_bad(*pgd))
    431			mask |= PGTBL_PGD_MODIFIED;
    432		if (pgd_none_or_clear_bad(pgd))
    433			continue;
    434		vunmap_p4d_range(pgd, addr, next, &mask);
    435	} while (pgd++, addr = next, addr != end);
    436
    437	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
    438		arch_sync_kernel_mappings(start, end);
    439}
    440
    441/**
    442 * vunmap_range - unmap kernel virtual addresses
    443 * @addr: start of the VM area to unmap
    444 * @end: end of the VM area to unmap (non-inclusive)
    445 *
    446 * Clears any present PTEs in the virtual address range, flushes TLBs and
    447 * caches. Any subsequent access to the address before it has been re-mapped
    448 * is a kernel bug.
    449 */
    450void vunmap_range(unsigned long addr, unsigned long end)
    451{
    452	flush_cache_vunmap(addr, end);
    453	vunmap_range_noflush(addr, end);
    454	flush_tlb_kernel_range(addr, end);
    455}
    456
    457static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
    458		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
    459		pgtbl_mod_mask *mask)
    460{
    461	pte_t *pte;
    462
    463	/*
    464	 * nr is a running index into the array which helps higher level
    465	 * callers keep track of where we're up to.
    466	 */
    467
    468	pte = pte_alloc_kernel_track(pmd, addr, mask);
    469	if (!pte)
    470		return -ENOMEM;
    471	do {
    472		struct page *page = pages[*nr];
    473
    474		if (WARN_ON(!pte_none(*pte)))
    475			return -EBUSY;
    476		if (WARN_ON(!page))
    477			return -ENOMEM;
    478		if (WARN_ON(!pfn_valid(page_to_pfn(page))))
    479			return -EINVAL;
    480
    481		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
    482		(*nr)++;
    483	} while (pte++, addr += PAGE_SIZE, addr != end);
    484	*mask |= PGTBL_PTE_MODIFIED;
    485	return 0;
    486}
    487
    488static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
    489		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
    490		pgtbl_mod_mask *mask)
    491{
    492	pmd_t *pmd;
    493	unsigned long next;
    494
    495	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
    496	if (!pmd)
    497		return -ENOMEM;
    498	do {
    499		next = pmd_addr_end(addr, end);
    500		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
    501			return -ENOMEM;
    502	} while (pmd++, addr = next, addr != end);
    503	return 0;
    504}
    505
    506static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
    507		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
    508		pgtbl_mod_mask *mask)
    509{
    510	pud_t *pud;
    511	unsigned long next;
    512
    513	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
    514	if (!pud)
    515		return -ENOMEM;
    516	do {
    517		next = pud_addr_end(addr, end);
    518		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
    519			return -ENOMEM;
    520	} while (pud++, addr = next, addr != end);
    521	return 0;
    522}
    523
    524static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
    525		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
    526		pgtbl_mod_mask *mask)
    527{
    528	p4d_t *p4d;
    529	unsigned long next;
    530
    531	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
    532	if (!p4d)
    533		return -ENOMEM;
    534	do {
    535		next = p4d_addr_end(addr, end);
    536		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
    537			return -ENOMEM;
    538	} while (p4d++, addr = next, addr != end);
    539	return 0;
    540}
    541
    542static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
    543		pgprot_t prot, struct page **pages)
    544{
    545	unsigned long start = addr;
    546	pgd_t *pgd;
    547	unsigned long next;
    548	int err = 0;
    549	int nr = 0;
    550	pgtbl_mod_mask mask = 0;
    551
    552	BUG_ON(addr >= end);
    553	pgd = pgd_offset_k(addr);
    554	do {
    555		next = pgd_addr_end(addr, end);
    556		if (pgd_bad(*pgd))
    557			mask |= PGTBL_PGD_MODIFIED;
    558		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
    559		if (err)
    560			return err;
    561	} while (pgd++, addr = next, addr != end);
    562
    563	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
    564		arch_sync_kernel_mappings(start, end);
    565
    566	return 0;
    567}
    568
    569/*
    570 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
    571 * flush caches.
    572 *
    573 * The caller is responsible for calling flush_cache_vmap() after this
    574 * function returns successfully and before the addresses are accessed.
    575 *
    576 * This is an internal function only. Do not use outside mm/.
    577 */
    578int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
    579		pgprot_t prot, struct page **pages, unsigned int page_shift)
    580{
    581	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
    582
    583	WARN_ON(page_shift < PAGE_SHIFT);
    584
    585	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
    586			page_shift == PAGE_SHIFT)
    587		return vmap_small_pages_range_noflush(addr, end, prot, pages);
    588
    589	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
    590		int err;
    591
    592		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
    593					__pa(page_address(pages[i])), prot,
    594					page_shift);
    595		if (err)
    596			return err;
    597
    598		addr += 1UL << page_shift;
    599	}
    600
    601	return 0;
    602}
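/*
 * For illustration (assuming 4 KiB base pages and page_shift == 21, i.e.
 * 2 MiB mappings as on x86-64): the loop above advances i by 512 pages
 * per iteration and maps each 2 MiB block with a single call to
 * vmap_range_noflush(), so the pages backing each block must be
 * physically contiguous and naturally aligned.
 */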
    603
    604/**
    605 * vmap_pages_range - map pages to a kernel virtual address
    606 * @addr: start of the VM area to map
    607 * @end: end of the VM area to map (non-inclusive)
    608 * @prot: page protection flags to use
    609 * @pages: pages to map (always PAGE_SIZE pages)
    610 * @page_shift: maximum shift that the pages may be mapped with, @pages must
    611 * be aligned and contiguous up to at least this shift.
    612 *
    613 * RETURNS:
    614 * 0 on success, -errno on failure.
    615 */
    616static int vmap_pages_range(unsigned long addr, unsigned long end,
    617		pgprot_t prot, struct page **pages, unsigned int page_shift)
    618{
    619	int err;
    620
    621	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
    622	flush_cache_vmap(addr, end);
    623	return err;
    624}
    625
    626int is_vmalloc_or_module_addr(const void *x)
    627{
    628	/*
    629	 * ARM, x86-64 and sparc64 put modules in a special place,
    630	 * and fall back on vmalloc() if that fails. Others
     631	 * just put them in the vmalloc space.
    632	 */
    633#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
    634	unsigned long addr = (unsigned long)kasan_reset_tag(x);
    635	if (addr >= MODULES_VADDR && addr < MODULES_END)
    636		return 1;
    637#endif
    638	return is_vmalloc_addr(x);
    639}
    640
    641/*
    642 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
    643 * return the tail page that corresponds to the base page address, which
    644 * matches small vmap mappings.
    645 */
    646struct page *vmalloc_to_page(const void *vmalloc_addr)
    647{
    648	unsigned long addr = (unsigned long) vmalloc_addr;
    649	struct page *page = NULL;
    650	pgd_t *pgd = pgd_offset_k(addr);
    651	p4d_t *p4d;
    652	pud_t *pud;
    653	pmd_t *pmd;
    654	pte_t *ptep, pte;
    655
    656	/*
    657	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
    658	 * architectures that do not vmalloc module space
    659	 */
    660	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
    661
    662	if (pgd_none(*pgd))
    663		return NULL;
    664	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
    665		return NULL; /* XXX: no allowance for huge pgd */
    666	if (WARN_ON_ONCE(pgd_bad(*pgd)))
    667		return NULL;
    668
    669	p4d = p4d_offset(pgd, addr);
    670	if (p4d_none(*p4d))
    671		return NULL;
    672	if (p4d_leaf(*p4d))
    673		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
    674	if (WARN_ON_ONCE(p4d_bad(*p4d)))
    675		return NULL;
    676
    677	pud = pud_offset(p4d, addr);
    678	if (pud_none(*pud))
    679		return NULL;
    680	if (pud_leaf(*pud))
    681		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
    682	if (WARN_ON_ONCE(pud_bad(*pud)))
    683		return NULL;
    684
    685	pmd = pmd_offset(pud, addr);
    686	if (pmd_none(*pmd))
    687		return NULL;
    688	if (pmd_leaf(*pmd))
    689		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
    690	if (WARN_ON_ONCE(pmd_bad(*pmd)))
    691		return NULL;
    692
    693	ptep = pte_offset_map(pmd, addr);
    694	pte = *ptep;
    695	if (pte_present(pte))
    696		page = pte_page(pte);
    697	pte_unmap(ptep);
    698
    699	return page;
    700}
    701EXPORT_SYMBOL(vmalloc_to_page);
    702
    703/*
    704 * Map a vmalloc()-space virtual address to the physical page frame number.
    705 */
    706unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
    707{
    708	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
    709}
    710EXPORT_SYMBOL(vmalloc_to_pfn);
    711
    712
    713/*** Global kva allocator ***/
    714
    715#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
    716#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
    717
    718
    719static DEFINE_SPINLOCK(vmap_area_lock);
    720static DEFINE_SPINLOCK(free_vmap_area_lock);
    721/* Export for kexec only */
    722LIST_HEAD(vmap_area_list);
    723static struct rb_root vmap_area_root = RB_ROOT;
    724static bool vmap_initialized __read_mostly;
    725
    726static struct rb_root purge_vmap_area_root = RB_ROOT;
    727static LIST_HEAD(purge_vmap_area_list);
    728static DEFINE_SPINLOCK(purge_vmap_area_lock);
    729
    730/*
    731 * This kmem_cache is used for vmap_area objects. Instead of
    732 * allocating from slab we reuse an object from this cache to
     733 * make things faster, especially in the "no edge" splitting of
     734 * a free block.
    735 */
    736static struct kmem_cache *vmap_area_cachep;
    737
    738/*
     739 * This linked list is used in tandem with free_vmap_area_root.
    740 * It gives O(1) access to prev/next to perform fast coalescing.
    741 */
    742static LIST_HEAD(free_vmap_area_list);
    743
    744/*
     745 * This augmented red-black tree represents the free vmap space.
    746 * All vmap_area objects in this tree are sorted by va->va_start
    747 * address. It is used for allocation and merging when a vmap
    748 * object is released.
    749 *
     750 * Each vmap_area node records the maximum free block size
     751 * available in its sub-tree, right or left. Therefore it is
     752 * possible to find the lowest match for a free area.
    753 */
    754static struct rb_root free_vmap_area_root = RB_ROOT;
    755
    756/*
    757 * Preload a CPU with one object for "no edge" split case. The
     758 * aim is to get rid of allocations from the atomic context and
     759 * thus to use more permissive allocation masks.
    760 */
    761static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
    762
    763static __always_inline unsigned long
    764va_size(struct vmap_area *va)
    765{
    766	return (va->va_end - va->va_start);
    767}
    768
    769static __always_inline unsigned long
    770get_subtree_max_size(struct rb_node *node)
    771{
    772	struct vmap_area *va;
    773
    774	va = rb_entry_safe(node, struct vmap_area, rb_node);
    775	return va ? va->subtree_max_size : 0;
    776}
    777
    778RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
    779	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
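/*
 * The macro above generates the augmented rbtree callbacks (including
 * free_vmap_area_rb_augment_cb_propagate()) that keep subtree_max_size
 * equal to the largest va_size() in each node's subtree, which is what
 * allows find_vmap_lowest_match() to descend the tree instead of
 * scanning the whole free list.
 */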
    780
    781static void purge_vmap_area_lazy(void);
    782static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
    783static void drain_vmap_area_work(struct work_struct *work);
    784static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
    785
    786static atomic_long_t nr_vmalloc_pages;
    787
    788unsigned long vmalloc_nr_pages(void)
    789{
    790	return atomic_long_read(&nr_vmalloc_pages);
    791}
    792
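/*
 * Return the busy vmap_area with the lowest va_end that still lies above
 * @addr: either the area containing @addr or, failing that, the next
 * area after it. NULL is returned if no such area exists.
 */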
    793static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
    794{
    795	struct vmap_area *va = NULL;
    796	struct rb_node *n = vmap_area_root.rb_node;
    797
    798	addr = (unsigned long)kasan_reset_tag((void *)addr);
    799
    800	while (n) {
    801		struct vmap_area *tmp;
    802
    803		tmp = rb_entry(n, struct vmap_area, rb_node);
    804		if (tmp->va_end > addr) {
    805			va = tmp;
    806			if (tmp->va_start <= addr)
    807				break;
    808
    809			n = n->rb_left;
    810		} else
    811			n = n->rb_right;
    812	}
    813
    814	return va;
    815}
    816
    817static struct vmap_area *__find_vmap_area(unsigned long addr)
    818{
    819	struct rb_node *n = vmap_area_root.rb_node;
    820
    821	addr = (unsigned long)kasan_reset_tag((void *)addr);
    822
    823	while (n) {
    824		struct vmap_area *va;
    825
    826		va = rb_entry(n, struct vmap_area, rb_node);
    827		if (addr < va->va_start)
    828			n = n->rb_left;
    829		else if (addr >= va->va_end)
    830			n = n->rb_right;
    831		else
    832			return va;
    833	}
    834
    835	return NULL;
    836}
    837
    838/*
     839 * This function returns the address of the parent node and of
     840 * its left or right link for further processing.
     841 *
     842 * NULL is returned if the new range overlaps an existing one.
     843 * In that case all further steps of inserting the conflicting
     844 * range have to be abandoned; such an overlap is considered a bug.
    845 */
    846static __always_inline struct rb_node **
    847find_va_links(struct vmap_area *va,
    848	struct rb_root *root, struct rb_node *from,
    849	struct rb_node **parent)
    850{
    851	struct vmap_area *tmp_va;
    852	struct rb_node **link;
    853
    854	if (root) {
    855		link = &root->rb_node;
    856		if (unlikely(!*link)) {
    857			*parent = NULL;
    858			return link;
    859		}
    860	} else {
    861		link = &from;
    862	}
    863
    864	/*
     865	 * Go to the bottom of the tree. When we hit the last point,
     866	 * we end up with the parent rb_node and the correct direction,
     867	 * named "link", where the new va->rb_node will be attached.
    868	 */
    869	do {
    870		tmp_va = rb_entry(*link, struct vmap_area, rb_node);
    871
    872		/*
     873		 * During the traversal we also do some sanity checking.
     874		 * Warn if the new range partially (left/right) or fully
     875		 * overlaps an existing one.
    876		 */
    877		if (va->va_start < tmp_va->va_end &&
    878				va->va_end <= tmp_va->va_start)
    879			link = &(*link)->rb_left;
    880		else if (va->va_end > tmp_va->va_start &&
    881				va->va_start >= tmp_va->va_end)
    882			link = &(*link)->rb_right;
    883		else {
    884			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
    885				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
    886
    887			return NULL;
    888		}
    889	} while (*link);
    890
    891	*parent = &tmp_va->rb_node;
    892	return link;
    893}
    894
    895static __always_inline struct list_head *
    896get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
    897{
    898	struct list_head *list;
    899
    900	if (unlikely(!parent))
    901		/*
    902		 * The red-black tree where we try to find VA neighbors
     903		 * before merging or inserting is empty, i.e. there is
     904		 * no free vmap space. Normally this does not happen,
     905		 * but we handle the case anyway.
    906		 */
    907		return NULL;
    908
    909	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
    910	return (&parent->rb_right == link ? list->next : list);
    911}
    912
    913static __always_inline void
    914link_va(struct vmap_area *va, struct rb_root *root,
    915	struct rb_node *parent, struct rb_node **link, struct list_head *head)
    916{
    917	/*
    918	 * VA is still not in the list, but we can
    919	 * identify its future previous list_head node.
    920	 */
    921	if (likely(parent)) {
    922		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
    923		if (&parent->rb_right != link)
    924			head = head->prev;
    925	}
    926
    927	/* Insert to the rb-tree */
    928	rb_link_node(&va->rb_node, parent, link);
    929	if (root == &free_vmap_area_root) {
    930		/*
     931		 * Some explanation here. Just perform a simple insertion
     932		 * into the tree. We do not set va->subtree_max_size to
     933		 * its current size before calling rb_insert_augmented(),
     934		 * because we populate the tree from the bottom up to the
     935		 * parent levels only once the node _is_ in the tree.
     936		 *
     937		 * Therefore we set subtree_max_size to zero after insertion
     938		 * and let __augment_tree_propagate_from() put everything
     939		 * into the correct order later on.
    940		 */
    941		rb_insert_augmented(&va->rb_node,
    942			root, &free_vmap_area_rb_augment_cb);
    943		va->subtree_max_size = 0;
    944	} else {
    945		rb_insert_color(&va->rb_node, root);
    946	}
    947
    948	/* Address-sort this list */
    949	list_add(&va->list, head);
    950}
    951
    952static __always_inline void
    953unlink_va(struct vmap_area *va, struct rb_root *root)
    954{
    955	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
    956		return;
    957
    958	if (root == &free_vmap_area_root)
    959		rb_erase_augmented(&va->rb_node,
    960			root, &free_vmap_area_rb_augment_cb);
    961	else
    962		rb_erase(&va->rb_node, root);
    963
    964	list_del(&va->list);
    965	RB_CLEAR_NODE(&va->rb_node);
    966}
    967
    968#if DEBUG_AUGMENT_PROPAGATE_CHECK
    969/*
     970 * Gets called when a node is removed or the tree is rotated.
    971 */
    972static __always_inline unsigned long
    973compute_subtree_max_size(struct vmap_area *va)
    974{
    975	return max3(va_size(va),
    976		get_subtree_max_size(va->rb_node.rb_left),
    977		get_subtree_max_size(va->rb_node.rb_right));
    978}
    979
    980static void
    981augment_tree_propagate_check(void)
    982{
    983	struct vmap_area *va;
    984	unsigned long computed_size;
    985
    986	list_for_each_entry(va, &free_vmap_area_list, list) {
    987		computed_size = compute_subtree_max_size(va);
    988		if (computed_size != va->subtree_max_size)
    989			pr_emerg("tree is corrupted: %lu, %lu\n",
    990				va_size(va), va->subtree_max_size);
    991	}
    992}
    993#endif
    994
    995/*
     996 * This function populates subtree_max_size from the bottom to the
     997 * upper levels starting from the VA point. The propagation must be
     998 * done when the VA size is modified by changing its va_start/va_end,
     999 * or when a new VA is inserted into the tree.
   1000 *
   1001 * It means that __augment_tree_propagate_from() must be called:
   1002 * - After VA has been inserted to the tree(free path);
   1003 * - After VA has been shrunk(allocation path);
   1004 * - After VA has been increased(merging path).
   1005 *
    1006 * Please note that this does not mean that upper parent nodes
   1007 * and their subtree_max_size are recalculated all the time up
   1008 * to the root node.
   1009 *
   1010 *       4--8
   1011 *        /\
   1012 *       /  \
   1013 *      /    \
   1014 *    2--2  8--8
   1015 *
    1016 * For example, if we modify the node 4, shrinking it to 2, then
    1017 * no modification is required at all. If we shrink the node 2 to 1,
    1018 * only its subtree_max_size is updated and set to 1. If we shrink
    1019 * the node 8 to 6, then its subtree_max_size is set to 6 and the
    1020 * parent node becomes 4--6.
   1021 */
   1022static __always_inline void
   1023augment_tree_propagate_from(struct vmap_area *va)
   1024{
   1025	/*
   1026	 * Populate the tree from bottom towards the root until
    1027	 * the calculated maximum available size of the checked node
   1028	 * is equal to its current one.
   1029	 */
   1030	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
   1031
   1032#if DEBUG_AUGMENT_PROPAGATE_CHECK
   1033	augment_tree_propagate_check();
   1034#endif
   1035}
   1036
   1037static void
   1038insert_vmap_area(struct vmap_area *va,
   1039	struct rb_root *root, struct list_head *head)
   1040{
   1041	struct rb_node **link;
   1042	struct rb_node *parent;
   1043
   1044	link = find_va_links(va, root, NULL, &parent);
   1045	if (link)
   1046		link_va(va, root, parent, link, head);
   1047}
   1048
   1049static void
   1050insert_vmap_area_augment(struct vmap_area *va,
   1051	struct rb_node *from, struct rb_root *root,
   1052	struct list_head *head)
   1053{
   1054	struct rb_node **link;
   1055	struct rb_node *parent;
   1056
   1057	if (from)
   1058		link = find_va_links(va, NULL, from, &parent);
   1059	else
   1060		link = find_va_links(va, root, NULL, &parent);
   1061
   1062	if (link) {
   1063		link_va(va, root, parent, link, head);
   1064		augment_tree_propagate_from(va);
   1065	}
   1066}
   1067
   1068/*
    1069 * Merge a de-allocated chunk of VA memory with the previous
    1070 * and next free blocks. If no coalescing is done, a new
    1071 * free area is inserted. If the VA has been merged, it is
    1072 * freed.
    1073 *
    1074 * Please note, it can return NULL for overlapping ranges,
    1075 * following a WARN() report. Although this is buggy
    1076 * behaviour, the system can stay alive and keep
    1077 * going.
   1078 */
   1079static __always_inline struct vmap_area *
   1080merge_or_add_vmap_area(struct vmap_area *va,
   1081	struct rb_root *root, struct list_head *head)
   1082{
   1083	struct vmap_area *sibling;
   1084	struct list_head *next;
   1085	struct rb_node **link;
   1086	struct rb_node *parent;
   1087	bool merged = false;
   1088
   1089	/*
   1090	 * Find a place in the tree where VA potentially will be
   1091	 * inserted, unless it is merged with its sibling/siblings.
   1092	 */
   1093	link = find_va_links(va, root, NULL, &parent);
   1094	if (!link)
   1095		return NULL;
   1096
   1097	/*
   1098	 * Get next node of VA to check if merging can be done.
   1099	 */
   1100	next = get_va_next_sibling(parent, link);
   1101	if (unlikely(next == NULL))
   1102		goto insert;
   1103
   1104	/*
   1105	 * start            end
   1106	 * |                |
   1107	 * |<------VA------>|<-----Next----->|
   1108	 *                  |                |
   1109	 *                  start            end
   1110	 */
   1111	if (next != head) {
   1112		sibling = list_entry(next, struct vmap_area, list);
   1113		if (sibling->va_start == va->va_end) {
   1114			sibling->va_start = va->va_start;
   1115
   1116			/* Free vmap_area object. */
   1117			kmem_cache_free(vmap_area_cachep, va);
   1118
   1119			/* Point to the new merged area. */
   1120			va = sibling;
   1121			merged = true;
   1122		}
   1123	}
   1124
   1125	/*
   1126	 * start            end
   1127	 * |                |
   1128	 * |<-----Prev----->|<------VA------>|
   1129	 *                  |                |
   1130	 *                  start            end
   1131	 */
   1132	if (next->prev != head) {
   1133		sibling = list_entry(next->prev, struct vmap_area, list);
   1134		if (sibling->va_end == va->va_start) {
   1135			/*
   1136			 * If both neighbors are coalesced, it is important
   1137			 * to unlink the "next" node first, followed by merging
   1138			 * with "previous" one. Otherwise the tree might not be
   1139			 * fully populated if a sibling's augmented value is
   1140			 * "normalized" because of rotation operations.
   1141			 */
   1142			if (merged)
   1143				unlink_va(va, root);
   1144
   1145			sibling->va_end = va->va_end;
   1146
   1147			/* Free vmap_area object. */
   1148			kmem_cache_free(vmap_area_cachep, va);
   1149
   1150			/* Point to the new merged area. */
   1151			va = sibling;
   1152			merged = true;
   1153		}
   1154	}
   1155
   1156insert:
   1157	if (!merged)
   1158		link_va(va, root, parent, link, head);
   1159
   1160	return va;
   1161}
   1162
   1163static __always_inline struct vmap_area *
   1164merge_or_add_vmap_area_augment(struct vmap_area *va,
   1165	struct rb_root *root, struct list_head *head)
   1166{
   1167	va = merge_or_add_vmap_area(va, root, head);
   1168	if (va)
   1169		augment_tree_propagate_from(va);
   1170
   1171	return va;
   1172}
   1173
   1174static __always_inline bool
   1175is_within_this_va(struct vmap_area *va, unsigned long size,
   1176	unsigned long align, unsigned long vstart)
   1177{
   1178	unsigned long nva_start_addr;
   1179
   1180	if (va->va_start > vstart)
   1181		nva_start_addr = ALIGN(va->va_start, align);
   1182	else
   1183		nva_start_addr = ALIGN(vstart, align);
   1184
    1185	/* Can overflow due to a big size or alignment. */
   1186	if (nva_start_addr + size < nva_start_addr ||
   1187			nva_start_addr < vstart)
   1188		return false;
   1189
   1190	return (nva_start_addr + size <= va->va_end);
   1191}
   1192
   1193/*
    1194 * Find the first free block (lowest start address) in the tree
    1195 * that can satisfy the request described by the passed
    1196 * parameters. Please note, with an alignment bigger than PAGE_SIZE,
    1197 * the search length is adjusted to account for the worst-case
    1198 * alignment overhead.
   1199 */
   1200static __always_inline struct vmap_area *
   1201find_vmap_lowest_match(unsigned long size, unsigned long align,
   1202	unsigned long vstart, bool adjust_search_size)
   1203{
   1204	struct vmap_area *va;
   1205	struct rb_node *node;
   1206	unsigned long length;
   1207
   1208	/* Start from the root. */
   1209	node = free_vmap_area_root.rb_node;
   1210
   1211	/* Adjust the search size for alignment overhead. */
   1212	length = adjust_search_size ? size + align - 1 : size;
   1213
   1214	while (node) {
   1215		va = rb_entry(node, struct vmap_area, rb_node);
   1216
   1217		if (get_subtree_max_size(node->rb_left) >= length &&
   1218				vstart < va->va_start) {
   1219			node = node->rb_left;
   1220		} else {
   1221			if (is_within_this_va(va, size, align, vstart))
   1222				return va;
   1223
   1224			/*
    1225			 * It does not make sense to go deeper into the right
    1226			 * sub-tree if it does not have a free block that is
    1227			 * equal to or bigger than the requested search length.
   1228			 */
   1229			if (get_subtree_max_size(node->rb_right) >= length) {
   1230				node = node->rb_right;
   1231				continue;
   1232			}
   1233
   1234			/*
    1235			 * OK. We roll back and find the first right sub-tree
    1236			 * that will satisfy the search criteria. This can happen
    1237			 * due to the "vstart" restriction or an alignment overhead
    1238			 * that is bigger than PAGE_SIZE.
   1239			 */
   1240			while ((node = rb_parent(node))) {
   1241				va = rb_entry(node, struct vmap_area, rb_node);
   1242				if (is_within_this_va(va, size, align, vstart))
   1243					return va;
   1244
   1245				if (get_subtree_max_size(node->rb_right) >= length &&
   1246						vstart <= va->va_start) {
   1247					/*
   1248					 * Shift the vstart forward. Please note, we update it with
    1249					 * the parent's start address plus "1" because we do not want
    1250					 * to enter the same sub-tree after it has already been checked
    1251					 * and no suitable free block was found there.
   1252					 */
   1253					vstart = va->va_start + 1;
   1254					node = node->rb_right;
   1255					break;
   1256				}
   1257			}
   1258		}
   1259	}
   1260
   1261	return NULL;
   1262}
   1263
   1264#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
   1265#include <linux/random.h>
   1266
   1267static struct vmap_area *
   1268find_vmap_lowest_linear_match(unsigned long size,
   1269	unsigned long align, unsigned long vstart)
   1270{
   1271	struct vmap_area *va;
   1272
   1273	list_for_each_entry(va, &free_vmap_area_list, list) {
   1274		if (!is_within_this_va(va, size, align, vstart))
   1275			continue;
   1276
   1277		return va;
   1278	}
   1279
   1280	return NULL;
   1281}
   1282
   1283static void
   1284find_vmap_lowest_match_check(unsigned long size, unsigned long align)
   1285{
   1286	struct vmap_area *va_1, *va_2;
   1287	unsigned long vstart;
   1288	unsigned int rnd;
   1289
   1290	get_random_bytes(&rnd, sizeof(rnd));
   1291	vstart = VMALLOC_START + rnd;
   1292
   1293	va_1 = find_vmap_lowest_match(size, align, vstart, false);
   1294	va_2 = find_vmap_lowest_linear_match(size, align, vstart);
   1295
   1296	if (va_1 != va_2)
   1297		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
   1298			va_1, va_2, vstart);
   1299}
   1300#endif
   1301
   1302enum fit_type {
   1303	NOTHING_FIT = 0,
   1304	FL_FIT_TYPE = 1,	/* full fit */
   1305	LE_FIT_TYPE = 2,	/* left edge fit */
   1306	RE_FIT_TYPE = 3,	/* right edge fit */
   1307	NE_FIT_TYPE = 4		/* no edge fit */
   1308};
   1309
   1310static __always_inline enum fit_type
   1311classify_va_fit_type(struct vmap_area *va,
   1312	unsigned long nva_start_addr, unsigned long size)
   1313{
   1314	enum fit_type type;
   1315
   1316	/* Check if it is within VA. */
   1317	if (nva_start_addr < va->va_start ||
   1318			nva_start_addr + size > va->va_end)
   1319		return NOTHING_FIT;
   1320
   1321	/* Now classify. */
   1322	if (va->va_start == nva_start_addr) {
   1323		if (va->va_end == nva_start_addr + size)
   1324			type = FL_FIT_TYPE;
   1325		else
   1326			type = LE_FIT_TYPE;
   1327	} else if (va->va_end == nva_start_addr + size) {
   1328		type = RE_FIT_TYPE;
   1329	} else {
   1330		type = NE_FIT_TYPE;
   1331	}
   1332
   1333	return type;
   1334}
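/*
 * For illustration, take a free VA spanning [0x1000, 0x9000) (made-up
 * numbers): a request at 0x1000 of size 0x8000 is FL_FIT_TYPE, at 0x1000
 * of size 0x2000 it is LE_FIT_TYPE, at 0x7000 of size 0x2000 it is
 * RE_FIT_TYPE, and at 0x3000 of size 0x2000 it is NE_FIT_TYPE.
 */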
   1335
   1336static __always_inline int
   1337adjust_va_to_fit_type(struct vmap_area *va,
   1338	unsigned long nva_start_addr, unsigned long size,
   1339	enum fit_type type)
   1340{
   1341	struct vmap_area *lva = NULL;
   1342
   1343	if (type == FL_FIT_TYPE) {
   1344		/*
   1345		 * No need to split VA, it fully fits.
   1346		 *
   1347		 * |               |
   1348		 * V      NVA      V
   1349		 * |---------------|
   1350		 */
   1351		unlink_va(va, &free_vmap_area_root);
   1352		kmem_cache_free(vmap_area_cachep, va);
   1353	} else if (type == LE_FIT_TYPE) {
   1354		/*
   1355		 * Split left edge of fit VA.
   1356		 *
   1357		 * |       |
   1358		 * V  NVA  V   R
   1359		 * |-------|-------|
   1360		 */
   1361		va->va_start += size;
   1362	} else if (type == RE_FIT_TYPE) {
   1363		/*
   1364		 * Split right edge of fit VA.
   1365		 *
   1366		 *         |       |
   1367		 *     L   V  NVA  V
   1368		 * |-------|-------|
   1369		 */
   1370		va->va_end = nva_start_addr;
   1371	} else if (type == NE_FIT_TYPE) {
   1372		/*
   1373		 * Split no edge of fit VA.
   1374		 *
   1375		 *     |       |
   1376		 *   L V  NVA  V R
   1377		 * |---|-------|---|
   1378		 */
   1379		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
   1380		if (unlikely(!lva)) {
   1381			/*
    1382			 * For the percpu allocator we do not do any pre-allocation
    1383			 * and leave it as it is. The reason is that it most likely
    1384			 * never ends up with NE_FIT_TYPE splitting. For percpu
    1385			 * allocations, offsets and sizes are aligned to a fixed
    1386			 * alignment request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
    1387			 * are its main fitting cases.
    1388			 *
    1389			 * There are a few exceptions though: for example, the
    1390			 * first allocation (early boot up), when we have "one"
    1391			 * big free space that has to be split.
    1392			 *
    1393			 * We can also hit this path for regular "vmap"
    1394			 * allocations, if "this" current CPU was not preloaded.
    1395			 * See the comment in alloc_vmap_area() for why. If so,
    1396			 * GFP_NOWAIT is used instead to get an extra object for
    1397			 * splitting purposes. That is rare and most of the time
    1398			 * does not occur.
    1399			 *
    1400			 * What happens if an allocation fails? Basically, an
    1401			 * "overflow" path is triggered to purge lazily freed
    1402			 * areas to free some memory, then the "retry" path is
    1403			 * triggered to repeat one more time. See more details
    1404			 * in the alloc_vmap_area() function.
   1405			 */
   1406			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
   1407			if (!lva)
   1408				return -1;
   1409		}
   1410
   1411		/*
   1412		 * Build the remainder.
   1413		 */
   1414		lva->va_start = va->va_start;
   1415		lva->va_end = nva_start_addr;
   1416
   1417		/*
   1418		 * Shrink this VA to remaining size.
   1419		 */
   1420		va->va_start = nva_start_addr + size;
   1421	} else {
   1422		return -1;
   1423	}
   1424
   1425	if (type != FL_FIT_TYPE) {
   1426		augment_tree_propagate_from(va);
   1427
   1428		if (lva)	/* type == NE_FIT_TYPE */
   1429			insert_vmap_area_augment(lva, &va->rb_node,
   1430				&free_vmap_area_root, &free_vmap_area_list);
   1431	}
   1432
   1433	return 0;
   1434}
   1435
   1436/*
    1437 * Returns the start address of the newly allocated area on success.
    1438 * Otherwise "vend" is returned, which indicates failure.
   1439 */
   1440static __always_inline unsigned long
   1441__alloc_vmap_area(unsigned long size, unsigned long align,
   1442	unsigned long vstart, unsigned long vend)
   1443{
   1444	bool adjust_search_size = true;
   1445	unsigned long nva_start_addr;
   1446	struct vmap_area *va;
   1447	enum fit_type type;
   1448	int ret;
   1449
   1450	/*
   1451	 * Do not adjust when:
   1452	 *   a) align <= PAGE_SIZE, because it does not make any sense.
    1453	 *      All blocks (their start addresses) are at least PAGE_SIZE
    1454	 *      aligned anyway;
    1455	 *   b) a short range where the requested size corresponds exactly to
    1456	 *      the specified [vstart:vend] interval and the alignment > PAGE_SIZE.
    1457	 *      With an adjusted search length the allocation would not succeed.
   1458	 */
   1459	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
   1460		adjust_search_size = false;
   1461
   1462	va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
   1463	if (unlikely(!va))
   1464		return vend;
   1465
   1466	if (va->va_start > vstart)
   1467		nva_start_addr = ALIGN(va->va_start, align);
   1468	else
   1469		nva_start_addr = ALIGN(vstart, align);
   1470
   1471	/* Check the "vend" restriction. */
   1472	if (nva_start_addr + size > vend)
   1473		return vend;
   1474
   1475	/* Classify what we have found. */
   1476	type = classify_va_fit_type(va, nva_start_addr, size);
   1477	if (WARN_ON_ONCE(type == NOTHING_FIT))
   1478		return vend;
   1479
   1480	/* Update the free vmap_area. */
   1481	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
   1482	if (ret)
   1483		return vend;
   1484
   1485#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
   1486	find_vmap_lowest_match_check(size, align);
   1487#endif
   1488
   1489	return nva_start_addr;
   1490}
   1491
   1492/*
   1493 * Free a region of KVA allocated by alloc_vmap_area
   1494 */
   1495static void free_vmap_area(struct vmap_area *va)
   1496{
   1497	/*
   1498	 * Remove from the busy tree/list.
   1499	 */
   1500	spin_lock(&vmap_area_lock);
   1501	unlink_va(va, &vmap_area_root);
   1502	spin_unlock(&vmap_area_lock);
   1503
   1504	/*
   1505	 * Insert/Merge it back to the free tree/list.
   1506	 */
   1507	spin_lock(&free_vmap_area_lock);
   1508	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
   1509	spin_unlock(&free_vmap_area_lock);
   1510}
   1511
   1512static inline void
   1513preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
   1514{
   1515	struct vmap_area *va = NULL;
   1516
   1517	/*
   1518	 * Preload this CPU with one extra vmap_area object. It is used
    1519	 * when the fit type of the free area is NE_FIT_TYPE. It guarantees
    1520	 * that a CPU doing an allocation is preloaded.
    1521	 *
    1522	 * We do it in a non-atomic context, which allows us to use more
    1523	 * permissive allocation masks and thus be more stable under low
    1524	 * memory conditions and high memory pressure.
   1525	 */
   1526	if (!this_cpu_read(ne_fit_preload_node))
   1527		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
   1528
   1529	spin_lock(lock);
   1530
   1531	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
   1532		kmem_cache_free(vmap_area_cachep, va);
   1533}
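/*
 * If the per-CPU slot was already populated by the time the lock is
 * taken, the cmpxchg above leaves it untouched and the freshly allocated
 * object is returned to the cache instead of leaking.
 */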
   1534
   1535/*
   1536 * Allocate a region of KVA of the specified size and alignment, within the
   1537 * vstart and vend.
   1538 */
   1539static struct vmap_area *alloc_vmap_area(unsigned long size,
   1540				unsigned long align,
   1541				unsigned long vstart, unsigned long vend,
   1542				int node, gfp_t gfp_mask)
   1543{
   1544	struct vmap_area *va;
   1545	unsigned long freed;
   1546	unsigned long addr;
   1547	int purged = 0;
   1548	int ret;
   1549
   1550	BUG_ON(!size);
   1551	BUG_ON(offset_in_page(size));
   1552	BUG_ON(!is_power_of_2(align));
   1553
   1554	if (unlikely(!vmap_initialized))
   1555		return ERR_PTR(-EBUSY);
   1556
   1557	might_sleep();
   1558	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
   1559
   1560	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
   1561	if (unlikely(!va))
   1562		return ERR_PTR(-ENOMEM);
   1563
   1564	/*
   1565	 * Only scan the relevant parts containing pointers to other objects
   1566	 * to avoid false negatives.
   1567	 */
   1568	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
   1569
   1570retry:
   1571	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
   1572	addr = __alloc_vmap_area(size, align, vstart, vend);
   1573	spin_unlock(&free_vmap_area_lock);
   1574
   1575	/*
   1576	 * If an allocation fails, the "vend" address is
   1577	 * returned. Therefore trigger the overflow path.
   1578	 */
   1579	if (unlikely(addr == vend))
   1580		goto overflow;
   1581
   1582	va->va_start = addr;
   1583	va->va_end = addr + size;
   1584	va->vm = NULL;
   1585
   1586	spin_lock(&vmap_area_lock);
   1587	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
   1588	spin_unlock(&vmap_area_lock);
   1589
   1590	BUG_ON(!IS_ALIGNED(va->va_start, align));
   1591	BUG_ON(va->va_start < vstart);
   1592	BUG_ON(va->va_end > vend);
   1593
   1594	ret = kasan_populate_vmalloc(addr, size);
   1595	if (ret) {
   1596		free_vmap_area(va);
   1597		return ERR_PTR(ret);
   1598	}
   1599
   1600	return va;
   1601
   1602overflow:
   1603	if (!purged) {
   1604		purge_vmap_area_lazy();
   1605		purged = 1;
   1606		goto retry;
   1607	}
   1608
   1609	freed = 0;
   1610	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
   1611
   1612	if (freed > 0) {
   1613		purged = 0;
   1614		goto retry;
   1615	}
   1616
   1617	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
   1618		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
   1619			size);
   1620
   1621	kmem_cache_free(vmap_area_cachep, va);
   1622	return ERR_PTR(-EBUSY);
   1623}
   1624
   1625int register_vmap_purge_notifier(struct notifier_block *nb)
   1626{
   1627	return blocking_notifier_chain_register(&vmap_notify_list, nb);
   1628}
   1629EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
   1630
   1631int unregister_vmap_purge_notifier(struct notifier_block *nb)
   1632{
   1633	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
   1634}
   1635EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
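/*
 * The notifiers registered here are invoked from the overflow path of
 * alloc_vmap_area(): if a callee reports freed memory through the
 * pointer argument, the failed allocation is retried once more.
 */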
   1636
   1637/*
   1638 * lazy_max_pages is the maximum amount of virtual address space we gather up
   1639 * before attempting to purge with a TLB flush.
   1640 *
   1641 * There is a tradeoff here: a larger number will cover more kernel page tables
   1642 * and take slightly longer to purge, but it will linearly reduce the number of
   1643 * global TLB flushes that must be performed. It would seem natural to scale
   1644 * this number up linearly with the number of CPUs (because vmapping activity
   1645 * could also scale linearly with the number of CPUs), however it is likely
   1646 * that in practice, workloads might be constrained in other ways that mean
   1647 * vmap activity will not scale linearly with CPUs. Also, I want to be
   1648 * conservative and not introduce a big latency on huge systems, so go with
   1649 * a less aggressive log scale. It will still be an improvement over the old
   1650 * code, and it will be simple to change the scale factor if we find that it
   1651 * becomes a problem on bigger systems.
   1652 */
   1653static unsigned long lazy_max_pages(void)
   1654{
   1655	unsigned int log;
   1656
   1657	log = fls(num_online_cpus());
   1658
   1659	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
   1660}
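/*
 * For illustration (assuming 4 KiB pages and 16 online CPUs): each log
 * step is 32 MiB / 4 KiB = 8192 pages and fls(16) = 5, so up to
 * 5 * 8192 = 40960 pages (160 MiB) of lazily freed virtual address
 * space may accumulate before a purge is triggered.
 */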
   1661
   1662static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
   1663
   1664/*
   1665 * Serialize vmap purging.  There is no actual critical section protected
    1666 * by this lock, but we want to avoid concurrent calls for performance
    1667 * reasons and to make pcpu_get_vm_areas() more deterministic.
   1668 */
   1669static DEFINE_MUTEX(vmap_purge_lock);
   1670
   1671/* for per-CPU blocks */
   1672static void purge_fragmented_blocks_allcpus(void);
   1673
   1674/*
   1675 * Purges all lazily-freed vmap areas.
   1676 */
   1677static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
   1678{
   1679	unsigned long resched_threshold;
   1680	struct list_head local_pure_list;
   1681	struct vmap_area *va, *n_va;
   1682
   1683	lockdep_assert_held(&vmap_purge_lock);
   1684
   1685	spin_lock(&purge_vmap_area_lock);
   1686	purge_vmap_area_root = RB_ROOT;
   1687	list_replace_init(&purge_vmap_area_list, &local_pure_list);
   1688	spin_unlock(&purge_vmap_area_lock);
   1689
   1690	if (unlikely(list_empty(&local_pure_list)))
   1691		return false;
   1692
   1693	start = min(start,
   1694		list_first_entry(&local_pure_list,
   1695			struct vmap_area, list)->va_start);
   1696
   1697	end = max(end,
   1698		list_last_entry(&local_pure_list,
   1699			struct vmap_area, list)->va_end);
   1700
   1701	flush_tlb_kernel_range(start, end);
   1702	resched_threshold = lazy_max_pages() << 1;
   1703
   1704	spin_lock(&free_vmap_area_lock);
   1705	list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
   1706		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
   1707		unsigned long orig_start = va->va_start;
   1708		unsigned long orig_end = va->va_end;
   1709
   1710		/*
    1711		 * Finally, insert or merge the lazily freed area. It is
   1712		 * detached and there is no need to "unlink" it from
   1713		 * anything.
   1714		 */
   1715		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
   1716				&free_vmap_area_list);
   1717
   1718		if (!va)
   1719			continue;
   1720
   1721		if (is_vmalloc_or_module_addr((void *)orig_start))
   1722			kasan_release_vmalloc(orig_start, orig_end,
   1723					      va->va_start, va->va_end);
   1724
   1725		atomic_long_sub(nr, &vmap_lazy_nr);
   1726
   1727		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
   1728			cond_resched_lock(&free_vmap_area_lock);
   1729	}
   1730	spin_unlock(&free_vmap_area_lock);
   1731	return true;
   1732}
   1733
   1734/*
   1735 * Kick off a purge of the outstanding lazy areas.
   1736 */
   1737static void purge_vmap_area_lazy(void)
   1738{
   1739	mutex_lock(&vmap_purge_lock);
   1740	purge_fragmented_blocks_allcpus();
   1741	__purge_vmap_area_lazy(ULONG_MAX, 0);
   1742	mutex_unlock(&vmap_purge_lock);
   1743}
   1744
   1745static void drain_vmap_area_work(struct work_struct *work)
   1746{
   1747	unsigned long nr_lazy;
   1748
   1749	do {
   1750		mutex_lock(&vmap_purge_lock);
   1751		__purge_vmap_area_lazy(ULONG_MAX, 0);
   1752		mutex_unlock(&vmap_purge_lock);
   1753
   1754		/* Recheck if further work is required. */
   1755		nr_lazy = atomic_long_read(&vmap_lazy_nr);
   1756	} while (nr_lazy > lazy_max_pages());
   1757}
   1758
   1759/*
   1760 * Free a vmap area; the caller must ensure that the area has been
   1761 * unmapped and that flush_cache_vunmap() has been called for the
   1762 * correct range beforehand.
   1763 */
   1764static void free_vmap_area_noflush(struct vmap_area *va)
   1765{
   1766	unsigned long nr_lazy;
   1767
   1768	spin_lock(&vmap_area_lock);
   1769	unlink_va(va, &vmap_area_root);
   1770	spin_unlock(&vmap_area_lock);
   1771
   1772	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
   1773				PAGE_SHIFT, &vmap_lazy_nr);
   1774
   1775	/*
   1776	 * Merge or place it into the purge tree/list.
   1777	 */
   1778	spin_lock(&purge_vmap_area_lock);
   1779	merge_or_add_vmap_area(va,
   1780		&purge_vmap_area_root, &purge_vmap_area_list);
   1781	spin_unlock(&purge_vmap_area_lock);
   1782
   1783	/* After this point, we may free va at any time */
   1784	if (unlikely(nr_lazy > lazy_max_pages()))
   1785		schedule_work(&drain_vmap_work);
   1786}
   1787
   1788/*
   1789 * Free and unmap a vmap area
   1790 */
   1791static void free_unmap_vmap_area(struct vmap_area *va)
   1792{
   1793	flush_cache_vunmap(va->va_start, va->va_end);
   1794	vunmap_range_noflush(va->va_start, va->va_end);
   1795	if (debug_pagealloc_enabled_static())
   1796		flush_tlb_kernel_range(va->va_start, va->va_end);
   1797
   1798	free_vmap_area_noflush(va);
   1799}
   1800
   1801struct vmap_area *find_vmap_area(unsigned long addr)
   1802{
   1803	struct vmap_area *va;
   1804
   1805	spin_lock(&vmap_area_lock);
   1806	va = __find_vmap_area(addr);
   1807	spin_unlock(&vmap_area_lock);
   1808
   1809	return va;
   1810}
   1811
   1812/*** Per cpu kva allocator ***/
   1813
   1814/*
   1815 * vmap space is limited especially on 32 bit architectures. Ensure there is
   1816 * room for at least 16 percpu vmap blocks per CPU.
   1817 */
   1818/*
   1819 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
   1820 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
   1821 * instead (we just need a rough idea)
   1822 */
   1823#if BITS_PER_LONG == 32
   1824#define VMALLOC_SPACE		(128UL*1024*1024)
   1825#else
   1826#define VMALLOC_SPACE		(128UL*1024*1024*1024)
   1827#endif
   1828
   1829#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
   1830#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
   1831#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
   1832#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
   1833#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
   1834#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
   1835#define VMAP_BBMAP_BITS		\
   1836		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
   1837		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
   1838			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
   1839
   1840#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
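
       /*
        * Worked example: on a 64-bit kernel with 4K pages and NR_CPUS == 64,
        * VMALLOC_PAGES is 128G / 4K == 32M pages, and 32M / 64 / 16 == 32768
        * bits, which is clamped to VMAP_BBMAP_BITS_MAX, so VMAP_BBMAP_BITS is
        * 1024 and VMAP_BLOCK_SIZE is 4MB. Allocations of up to
        * VMAP_MAX_ALLOC == 64 pages (256K) are served from such blocks.
        */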
   1841
   1842struct vmap_block_queue {
   1843	spinlock_t lock;
   1844	struct list_head free;
   1845};
   1846
   1847struct vmap_block {
   1848	spinlock_t lock;
   1849	struct vmap_area *va;
   1850	unsigned long free, dirty;
   1851	unsigned long dirty_min, dirty_max; /*< dirty range */
   1852	struct list_head free_list;
   1853	struct rcu_head rcu_head;
   1854	struct list_head purge;
   1855};
   1856
   1857/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
   1858static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
   1859
   1860/*
   1861 * XArray of vmap blocks, indexed by address, to quickly find a vmap block
   1862 * in the free path. Could get rid of this if we change the API to return a
   1863 * "cookie" from alloc, to be passed to free. But no big deal yet.
   1864 */
   1865static DEFINE_XARRAY(vmap_blocks);
   1866
   1867/*
   1868 * We should probably have a fallback mechanism to allocate virtual memory
   1869 * out of partially filled vmap blocks. However vmap block sizing should be
   1870 * fairly reasonable according to the vmalloc size, so it shouldn't be a
   1871 * big problem.
   1872 */
   1873
   1874static unsigned long addr_to_vb_idx(unsigned long addr)
   1875{
   1876	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
   1877	addr /= VMAP_BLOCK_SIZE;
   1878	return addr;
   1879}
   1880
   1881static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
   1882{
   1883	unsigned long addr;
   1884
   1885	addr = va_start + (pages_off << PAGE_SHIFT);
   1886	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
   1887	return (void *)addr;
   1888}
   1889
   1890/**
   1891 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in it
   1892 *                  (the number of pages cannot exceed VMAP_BBMAP_BITS)
   1893 * @order:    allocation order; 2^order pages are occupied in the new block
   1894 * @gfp_mask: flags for the page level allocator
   1895 *
   1896 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
   1897 */
   1898static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
   1899{
   1900	struct vmap_block_queue *vbq;
   1901	struct vmap_block *vb;
   1902	struct vmap_area *va;
   1903	unsigned long vb_idx;
   1904	int node, err;
   1905	void *vaddr;
   1906
   1907	node = numa_node_id();
   1908
   1909	vb = kmalloc_node(sizeof(struct vmap_block),
   1910			gfp_mask & GFP_RECLAIM_MASK, node);
   1911	if (unlikely(!vb))
   1912		return ERR_PTR(-ENOMEM);
   1913
   1914	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
   1915					VMALLOC_START, VMALLOC_END,
   1916					node, gfp_mask);
   1917	if (IS_ERR(va)) {
   1918		kfree(vb);
   1919		return ERR_CAST(va);
   1920	}
   1921
   1922	vaddr = vmap_block_vaddr(va->va_start, 0);
   1923	spin_lock_init(&vb->lock);
   1924	vb->va = va;
   1925	/* At least something should be left free */
   1926	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
   1927	vb->free = VMAP_BBMAP_BITS - (1UL << order);
   1928	vb->dirty = 0;
   1929	vb->dirty_min = VMAP_BBMAP_BITS;
   1930	vb->dirty_max = 0;
   1931	INIT_LIST_HEAD(&vb->free_list);
   1932
   1933	vb_idx = addr_to_vb_idx(va->va_start);
   1934	err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
   1935	if (err) {
   1936		kfree(vb);
   1937		free_vmap_area(va);
   1938		return ERR_PTR(err);
   1939	}
   1940
   1941	vbq = raw_cpu_ptr(&vmap_block_queue);
   1942	spin_lock(&vbq->lock);
   1943	list_add_tail_rcu(&vb->free_list, &vbq->free);
   1944	spin_unlock(&vbq->lock);
   1945
   1946	return vaddr;
   1947}
   1948
   1949static void free_vmap_block(struct vmap_block *vb)
   1950{
   1951	struct vmap_block *tmp;
   1952
   1953	tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
   1954	BUG_ON(tmp != vb);
   1955
   1956	free_vmap_area_noflush(vb->va);
   1957	kfree_rcu(vb, rcu_head);
   1958}
   1959
   1960static void purge_fragmented_blocks(int cpu)
   1961{
   1962	LIST_HEAD(purge);
   1963	struct vmap_block *vb;
   1964	struct vmap_block *n_vb;
   1965	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
   1966
   1967	rcu_read_lock();
   1968	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
   1969
   1970		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
   1971			continue;
   1972
   1973		spin_lock(&vb->lock);
   1974		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
   1975			vb->free = 0; /* prevent further allocs after releasing lock */
   1976			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
   1977			vb->dirty_min = 0;
   1978			vb->dirty_max = VMAP_BBMAP_BITS;
   1979			spin_lock(&vbq->lock);
   1980			list_del_rcu(&vb->free_list);
   1981			spin_unlock(&vbq->lock);
   1982			spin_unlock(&vb->lock);
   1983			list_add_tail(&vb->purge, &purge);
   1984		} else
   1985			spin_unlock(&vb->lock);
   1986	}
   1987	rcu_read_unlock();
   1988
   1989	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
   1990		list_del(&vb->purge);
   1991		free_vmap_block(vb);
   1992	}
   1993}
   1994
   1995static void purge_fragmented_blocks_allcpus(void)
   1996{
   1997	int cpu;
   1998
   1999	for_each_possible_cpu(cpu)
   2000		purge_fragmented_blocks(cpu);
   2001}
   2002
   2003static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
   2004{
   2005	struct vmap_block_queue *vbq;
   2006	struct vmap_block *vb;
   2007	void *vaddr = NULL;
   2008	unsigned int order;
   2009
   2010	BUG_ON(offset_in_page(size));
   2011	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
   2012	if (WARN_ON(size == 0)) {
   2013		/*
   2014		 * Allocating 0 bytes isn't what the caller wants, and
   2015		 * get_order(0) returns a bogus result. Just warn and
   2016		 * terminate early.
   2017		 */
   2018		return NULL;
   2019	}
   2020	order = get_order(size);
   2021
   2022	rcu_read_lock();
   2023	vbq = raw_cpu_ptr(&vmap_block_queue);
   2024	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
   2025		unsigned long pages_off;
   2026
   2027		spin_lock(&vb->lock);
   2028		if (vb->free < (1UL << order)) {
   2029			spin_unlock(&vb->lock);
   2030			continue;
   2031		}
   2032
   2033		pages_off = VMAP_BBMAP_BITS - vb->free;
   2034		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
   2035		vb->free -= 1UL << order;
   2036		if (vb->free == 0) {
   2037			spin_lock(&vbq->lock);
   2038			list_del_rcu(&vb->free_list);
   2039			spin_unlock(&vbq->lock);
   2040		}
   2041
   2042		spin_unlock(&vb->lock);
   2043		break;
   2044	}
   2045
   2046	rcu_read_unlock();
   2047
   2048	/* Allocate new block if nothing was found */
   2049	if (!vaddr)
   2050		vaddr = new_vmap_block(order, gfp_mask);
   2051
   2052	return vaddr;
   2053}
   2054
   2055static void vb_free(unsigned long addr, unsigned long size)
   2056{
   2057	unsigned long offset;
   2058	unsigned int order;
   2059	struct vmap_block *vb;
   2060
   2061	BUG_ON(offset_in_page(size));
   2062	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
   2063
   2064	flush_cache_vunmap(addr, addr + size);
   2065
   2066	order = get_order(size);
   2067	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
   2068	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
   2069
   2070	vunmap_range_noflush(addr, addr + size);
   2071
   2072	if (debug_pagealloc_enabled_static())
   2073		flush_tlb_kernel_range(addr, addr + size);
   2074
   2075	spin_lock(&vb->lock);
   2076
   2077	/* Expand dirty range */
   2078	vb->dirty_min = min(vb->dirty_min, offset);
   2079	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
   2080
   2081	vb->dirty += 1UL << order;
   2082	if (vb->dirty == VMAP_BBMAP_BITS) {
   2083		BUG_ON(vb->free);
   2084		spin_unlock(&vb->lock);
   2085		free_vmap_block(vb);
   2086	} else
   2087		spin_unlock(&vb->lock);
   2088}
   2089
   2090static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
   2091{
   2092	int cpu;
   2093
   2094	if (unlikely(!vmap_initialized))
   2095		return;
   2096
   2097	might_sleep();
   2098
   2099	for_each_possible_cpu(cpu) {
   2100		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
   2101		struct vmap_block *vb;
   2102
   2103		rcu_read_lock();
   2104		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
   2105			spin_lock(&vb->lock);
   2106			if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
   2107				unsigned long va_start = vb->va->va_start;
   2108				unsigned long s, e;
   2109
   2110				s = va_start + (vb->dirty_min << PAGE_SHIFT);
   2111				e = va_start + (vb->dirty_max << PAGE_SHIFT);
   2112
   2113				start = min(s, start);
   2114				end   = max(e, end);
   2115
   2116				flush = 1;
   2117			}
   2118			spin_unlock(&vb->lock);
   2119		}
   2120		rcu_read_unlock();
   2121	}
   2122
   2123	mutex_lock(&vmap_purge_lock);
   2124	purge_fragmented_blocks_allcpus();
   2125	if (!__purge_vmap_area_lazy(start, end) && flush)
   2126		flush_tlb_kernel_range(start, end);
   2127	mutex_unlock(&vmap_purge_lock);
   2128}
   2129
   2130/**
   2131 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
   2132 *
   2133 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
   2134 * to amortize TLB flushing overheads. What this means is that any page you
   2135 * have now may, in a former life, have been mapped into a kernel virtual
   2136 * address by the vmap layer and so there might be some CPUs with TLB entries
   2137 * still referencing that page (in addition to the regular 1:1 kernel mapping).
   2138 *
   2139 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
   2140 * be sure that none of the pages we have control over will have any aliases
   2141 * from the vmap layer.
   2142 */
   2143void vm_unmap_aliases(void)
   2144{
   2145	unsigned long start = ULONG_MAX, end = 0;
   2146	int flush = 0;
   2147
   2148	_vm_unmap_aliases(start, end, flush);
   2149}
   2150EXPORT_SYMBOL_GPL(vm_unmap_aliases);
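
       /*
        * Illustrative sketch (not built, hence the "#if 0"): a caller that
        * changes direct-map permissions on pages it previously had vmap()ed
        * typically drops the mapping and then calls vm_unmap_aliases() so no
        * lazily kept alias can still reach the pages. The helper name is
        * hypothetical and set_memory_ro() is assumed to be available
        * (CONFIG_ARCH_HAS_SET_MEMORY).
        */
       #if 0
       static int example_protect_pages(void *vaddr, struct page **pages, int nr)
       {
               int i, ret;

               vunmap(vaddr);          /* drop the vmap() alias of the pages */
               vm_unmap_aliases();     /* flush any lazily kept aliases/TLB entries */

               for (i = 0; i < nr; i++) {
                       ret = set_memory_ro((unsigned long)page_address(pages[i]), 1);
                       if (ret)
                               return ret;
               }

               return 0;
       }
       #endif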
   2151
   2152/**
   2153 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
   2154 * @mem: the pointer returned by vm_map_ram
   2155 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
   2156 */
   2157void vm_unmap_ram(const void *mem, unsigned int count)
   2158{
   2159	unsigned long size = (unsigned long)count << PAGE_SHIFT;
   2160	unsigned long addr = (unsigned long)kasan_reset_tag(mem);
   2161	struct vmap_area *va;
   2162
   2163	might_sleep();
   2164	BUG_ON(!addr);
   2165	BUG_ON(addr < VMALLOC_START);
   2166	BUG_ON(addr > VMALLOC_END);
   2167	BUG_ON(!PAGE_ALIGNED(addr));
   2168
   2169	kasan_poison_vmalloc(mem, size);
   2170
   2171	if (likely(count <= VMAP_MAX_ALLOC)) {
   2172		debug_check_no_locks_freed(mem, size);
   2173		vb_free(addr, size);
   2174		return;
   2175	}
   2176
   2177	va = find_vmap_area(addr);
   2178	BUG_ON(!va);
   2179	debug_check_no_locks_freed((void *)va->va_start,
   2180				    (va->va_end - va->va_start));
   2181	free_unmap_vmap_area(va);
   2182}
   2183EXPORT_SYMBOL(vm_unmap_ram);
   2184
   2185/**
   2186 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
   2187 * @pages: an array of pointers to the pages to be mapped
   2188 * @count: number of pages
   2189 * @node: prefer to allocate data structures on this node
   2190 *
   2191 * If you use this function for fewer than VMAP_MAX_ALLOC pages, it can be
   2192 * faster than vmap().  But if you mix long-lived and short-lived objects
   2193 * with vm_map_ram(), it can consume a lot of address space through
   2194 * fragmentation (especially on a 32-bit machine), and you could eventually
   2195 * see failures.  Please use this function only for short-lived objects.
   2196 *
   2197 * Returns: a pointer to the address that has been mapped, or %NULL on failure
   2198 */
   2199void *vm_map_ram(struct page **pages, unsigned int count, int node)
   2200{
   2201	unsigned long size = (unsigned long)count << PAGE_SHIFT;
   2202	unsigned long addr;
   2203	void *mem;
   2204
   2205	if (likely(count <= VMAP_MAX_ALLOC)) {
   2206		mem = vb_alloc(size, GFP_KERNEL);
   2207		if (IS_ERR(mem))
   2208			return NULL;
   2209		addr = (unsigned long)mem;
   2210	} else {
   2211		struct vmap_area *va;
   2212		va = alloc_vmap_area(size, PAGE_SIZE,
   2213				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
   2214		if (IS_ERR(va))
   2215			return NULL;
   2216
   2217		addr = va->va_start;
   2218		mem = (void *)addr;
   2219	}
   2220
   2221	if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
   2222				pages, PAGE_SHIFT) < 0) {
   2223		vm_unmap_ram(mem, count);
   2224		return NULL;
   2225	}
   2226
   2227	/*
   2228	 * Mark the pages as accessible, now that they are mapped.
   2229	 * With hardware tag-based KASAN, marking is skipped for
   2230	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
   2231	 */
   2232	mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
   2233
   2234	return mem;
   2235}
   2236EXPORT_SYMBOL(vm_map_ram);
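
       /*
        * Illustrative sketch (not built, hence the "#if 0"): a short-lived
        * mapping of a small page array with vm_map_ram(), torn down again
        * with the same @count. The helper name is hypothetical and @len is
        * assumed to fit within @count pages.
        */
       #if 0
       static int example_copy_through_vmap(struct page **pages, unsigned int count,
                                            const void *src, size_t len)
       {
               void *vaddr;

               vaddr = vm_map_ram(pages, count, NUMA_NO_NODE);
               if (!vaddr)
                       return -ENOMEM;

               memcpy(vaddr, src, len);
               vm_unmap_ram(vaddr, count);

               return 0;
       }
       #endif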
   2237
   2238static struct vm_struct *vmlist __initdata;
   2239
   2240static inline unsigned int vm_area_page_order(struct vm_struct *vm)
   2241{
   2242#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
   2243	return vm->page_order;
   2244#else
   2245	return 0;
   2246#endif
   2247}
   2248
   2249static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
   2250{
   2251#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
   2252	vm->page_order = order;
   2253#else
   2254	BUG_ON(order != 0);
   2255#endif
   2256}
   2257
   2258/**
   2259 * vm_area_add_early - add vmap area early during boot
   2260 * @vm: vm_struct to add
   2261 *
   2262 * This function is used to add fixed kernel vm area to vmlist before
   2263 * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
   2264 * should contain proper values and the other fields should be zero.
   2265 *
   2266 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
   2267 */
   2268void __init vm_area_add_early(struct vm_struct *vm)
   2269{
   2270	struct vm_struct *tmp, **p;
   2271
   2272	BUG_ON(vmap_initialized);
   2273	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
   2274		if (tmp->addr >= vm->addr) {
   2275			BUG_ON(tmp->addr < vm->addr + vm->size);
   2276			break;
   2277		} else
   2278			BUG_ON(tmp->addr + tmp->size > vm->addr);
   2279	}
   2280	vm->next = *p;
   2281	*p = vm;
   2282}
   2283
   2284/**
   2285 * vm_area_register_early - register vmap area early during boot
   2286 * @vm: vm_struct to register
   2287 * @align: requested alignment
   2288 *
   2289 * This function is used to register kernel vm area before
   2290 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
   2291 * proper values on entry and other fields should be zero.  On return,
   2292 * vm->addr contains the allocated address.
   2293 *
   2294 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
   2295 */
   2296void __init vm_area_register_early(struct vm_struct *vm, size_t align)
   2297{
   2298	unsigned long addr = ALIGN(VMALLOC_START, align);
   2299	struct vm_struct *cur, **p;
   2300
   2301	BUG_ON(vmap_initialized);
   2302
   2303	for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
   2304		if ((unsigned long)cur->addr - addr >= vm->size)
   2305			break;
   2306		addr = ALIGN((unsigned long)cur->addr + cur->size, align);
   2307	}
   2308
   2309	BUG_ON(addr > VMALLOC_END - vm->size);
   2310	vm->addr = (void *)addr;
   2311	vm->next = *p;
   2312	*p = vm;
   2313	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
   2314}
   2315
   2316static void vmap_init_free_space(void)
   2317{
   2318	unsigned long vmap_start = 1;
   2319	const unsigned long vmap_end = ULONG_MAX;
   2320	struct vmap_area *busy, *free;
   2321
   2322	/*
   2323	 *     B     F     B     B     B     F
   2324	 * -|-----|.....|-----|-----|-----|.....|-
   2325	 *  |           The KVA space           |
   2326	 *  |<--------------------------------->|
   2327	 */
   2328	list_for_each_entry(busy, &vmap_area_list, list) {
   2329		if (busy->va_start - vmap_start > 0) {
   2330			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
   2331			if (!WARN_ON_ONCE(!free)) {
   2332				free->va_start = vmap_start;
   2333				free->va_end = busy->va_start;
   2334
   2335				insert_vmap_area_augment(free, NULL,
   2336					&free_vmap_area_root,
   2337						&free_vmap_area_list);
   2338			}
   2339		}
   2340
   2341		vmap_start = busy->va_end;
   2342	}
   2343
   2344	if (vmap_end - vmap_start > 0) {
   2345		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
   2346		if (!WARN_ON_ONCE(!free)) {
   2347			free->va_start = vmap_start;
   2348			free->va_end = vmap_end;
   2349
   2350			insert_vmap_area_augment(free, NULL,
   2351				&free_vmap_area_root,
   2352					&free_vmap_area_list);
   2353		}
   2354	}
   2355}
   2356
   2357void __init vmalloc_init(void)
   2358{
   2359	struct vmap_area *va;
   2360	struct vm_struct *tmp;
   2361	int i;
   2362
   2363	/*
   2364	 * Create the cache for vmap_area objects.
   2365	 */
   2366	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
   2367
   2368	for_each_possible_cpu(i) {
   2369		struct vmap_block_queue *vbq;
   2370		struct vfree_deferred *p;
   2371
   2372		vbq = &per_cpu(vmap_block_queue, i);
   2373		spin_lock_init(&vbq->lock);
   2374		INIT_LIST_HEAD(&vbq->free);
   2375		p = &per_cpu(vfree_deferred, i);
   2376		init_llist_head(&p->list);
   2377		INIT_WORK(&p->wq, free_work);
   2378	}
   2379
   2380	/* Import existing vmlist entries. */
   2381	for (tmp = vmlist; tmp; tmp = tmp->next) {
   2382		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
   2383		if (WARN_ON_ONCE(!va))
   2384			continue;
   2385
   2386		va->va_start = (unsigned long)tmp->addr;
   2387		va->va_end = va->va_start + tmp->size;
   2388		va->vm = tmp;
   2389		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
   2390	}
   2391
   2392	/*
   2393	 * Now we can initialize a free vmap space.
   2394	 */
   2395	vmap_init_free_space();
   2396	vmap_initialized = true;
   2397}
   2398
   2399static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
   2400	struct vmap_area *va, unsigned long flags, const void *caller)
   2401{
   2402	vm->flags = flags;
   2403	vm->addr = (void *)va->va_start;
   2404	vm->size = va->va_end - va->va_start;
   2405	vm->caller = caller;
   2406	va->vm = vm;
   2407}
   2408
   2409static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
   2410			      unsigned long flags, const void *caller)
   2411{
   2412	spin_lock(&vmap_area_lock);
   2413	setup_vmalloc_vm_locked(vm, va, flags, caller);
   2414	spin_unlock(&vmap_area_lock);
   2415}
   2416
   2417static void clear_vm_uninitialized_flag(struct vm_struct *vm)
   2418{
   2419	/*
   2420	 * Before removing VM_UNINITIALIZED,
   2421	 * we should make sure that vm has proper values.
   2422	 * Pair with smp_rmb() in show_numa_info().
   2423	 */
   2424	smp_wmb();
   2425	vm->flags &= ~VM_UNINITIALIZED;
   2426}
   2427
   2428static struct vm_struct *__get_vm_area_node(unsigned long size,
   2429		unsigned long align, unsigned long shift, unsigned long flags,
   2430		unsigned long start, unsigned long end, int node,
   2431		gfp_t gfp_mask, const void *caller)
   2432{
   2433	struct vmap_area *va;
   2434	struct vm_struct *area;
   2435	unsigned long requested_size = size;
   2436
   2437	BUG_ON(in_interrupt());
   2438	size = ALIGN(size, 1ul << shift);
   2439	if (unlikely(!size))
   2440		return NULL;
   2441
   2442	if (flags & VM_IOREMAP)
   2443		align = 1ul << clamp_t(int, get_count_order_long(size),
   2444				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
   2445
   2446	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
   2447	if (unlikely(!area))
   2448		return NULL;
   2449
   2450	if (!(flags & VM_NO_GUARD))
   2451		size += PAGE_SIZE;
   2452
   2453	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
   2454	if (IS_ERR(va)) {
   2455		kfree(area);
   2456		return NULL;
   2457	}
   2458
   2459	setup_vmalloc_vm(area, va, flags, caller);
   2460
   2461	/*
   2462	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
   2463	 * best-effort approach, as they can be mapped outside of vmalloc code.
   2464	 * For VM_ALLOC mappings, the pages are marked as accessible after
   2465	 * getting mapped in __vmalloc_node_range().
   2466	 * With hardware tag-based KASAN, marking is skipped for
   2467	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
   2468	 */
   2469	if (!(flags & VM_ALLOC))
   2470		area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
   2471						    KASAN_VMALLOC_PROT_NORMAL);
   2472
   2473	return area;
   2474}
   2475
   2476struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
   2477				       unsigned long start, unsigned long end,
   2478				       const void *caller)
   2479{
   2480	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
   2481				  NUMA_NO_NODE, GFP_KERNEL, caller);
   2482}
   2483
   2484/**
   2485 * get_vm_area - reserve a contiguous kernel virtual area
   2486 * @size:	 size of the area
   2487 * @flags:	 %VM_IOREMAP for I/O mappings or VM_ALLOC
   2488 *
   2489 * Search for an area of @size in the kernel virtual mapping area,
   2490 * and reserve it for our purposes.  Returns the area descriptor
   2491 * on success or %NULL on failure.
   2492 *
   2493 * Return: the area descriptor on success or %NULL on failure.
   2494 */
   2495struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
   2496{
   2497	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
   2498				  VMALLOC_START, VMALLOC_END,
   2499				  NUMA_NO_NODE, GFP_KERNEL,
   2500				  __builtin_return_address(0));
   2501}
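
       /*
        * Illustrative sketch (not built, hence the "#if 0"): roughly how an
        * ioremap()-style helper can use get_vm_area() to reserve kernel
        * virtual space and then map a physical range into it. The helper
        * name is hypothetical; @size is assumed to be page aligned and the
        * caller picks an appropriate @prot.
        */
       #if 0
       static void __iomem *example_ioremap(phys_addr_t phys, size_t size, pgprot_t prot)
       {
               struct vm_struct *area;
               unsigned long vaddr;

               area = get_vm_area(size, VM_IOREMAP);
               if (!area)
                       return NULL;

               vaddr = (unsigned long)area->addr;
               if (ioremap_page_range(vaddr, vaddr + size, phys, prot)) {
                       free_vm_area(area);
                       return NULL;
               }

               return (void __iomem *)vaddr;
       }
       #endif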
   2502
   2503struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
   2504				const void *caller)
   2505{
   2506	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
   2507				  VMALLOC_START, VMALLOC_END,
   2508				  NUMA_NO_NODE, GFP_KERNEL, caller);
   2509}
   2510
   2511/**
   2512 * find_vm_area - find a continuous kernel virtual area
   2513 * @addr:	  base address
   2514 *
   2515 * Search for the kernel VM area starting at @addr, and return it.
   2516 * It is up to the caller to do all required locking to keep the returned
   2517 * pointer valid.
   2518 *
   2519 * Return: the area descriptor on success or %NULL on failure.
   2520 */
   2521struct vm_struct *find_vm_area(const void *addr)
   2522{
   2523	struct vmap_area *va;
   2524
   2525	va = find_vmap_area((unsigned long)addr);
   2526	if (!va)
   2527		return NULL;
   2528
   2529	return va->vm;
   2530}
   2531
   2532/**
   2533 * remove_vm_area - find and remove a continuous kernel virtual area
   2534 * @addr:	    base address
   2535 *
   2536 * Search for the kernel VM area starting at @addr, and remove it.
   2537 * This function returns the found VM area, but using it is NOT safe
   2538 * on SMP machines, except for its size or flags.
   2539 *
   2540 * Return: the area descriptor on success or %NULL on failure.
   2541 */
   2542struct vm_struct *remove_vm_area(const void *addr)
   2543{
   2544	struct vmap_area *va;
   2545
   2546	might_sleep();
   2547
   2548	spin_lock(&vmap_area_lock);
   2549	va = __find_vmap_area((unsigned long)addr);
   2550	if (va && va->vm) {
   2551		struct vm_struct *vm = va->vm;
   2552
   2553		va->vm = NULL;
   2554		spin_unlock(&vmap_area_lock);
   2555
   2556		kasan_free_module_shadow(vm);
   2557		free_unmap_vmap_area(va);
   2558
   2559		return vm;
   2560	}
   2561
   2562	spin_unlock(&vmap_area_lock);
   2563	return NULL;
   2564}
   2565
   2566static inline void set_area_direct_map(const struct vm_struct *area,
   2567				       int (*set_direct_map)(struct page *page))
   2568{
   2569	int i;
   2570
   2571	/* HUGE_VMALLOC passes small pages to set_direct_map */
   2572	for (i = 0; i < area->nr_pages; i++)
   2573		if (page_address(area->pages[i]))
   2574			set_direct_map(area->pages[i]);
   2575}
   2576
   2577/* Handle removing and resetting vm mappings related to the vm_struct. */
   2578static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
   2579{
   2580	unsigned long start = ULONG_MAX, end = 0;
   2581	unsigned int page_order = vm_area_page_order(area);
   2582	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
   2583	int flush_dmap = 0;
   2584	int i;
   2585
   2586	remove_vm_area(area->addr);
   2587
   2588	/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
   2589	if (!flush_reset)
   2590		return;
   2591
   2592	/*
   2593	 * If not deallocating pages, just do the flush of the VM area and
   2594	 * return.
   2595	 */
   2596	if (!deallocate_pages) {
   2597		vm_unmap_aliases();
   2598		return;
   2599	}
   2600
   2601	/*
   2602	 * If execution gets here, flush the vm mapping and reset the direct
   2603	 * map. Find the start and end range of the direct mappings to make sure
   2604	 * the vm_unmap_aliases() flush includes the direct map.
   2605	 */
   2606	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
   2607		unsigned long addr = (unsigned long)page_address(area->pages[i]);
   2608		if (addr) {
   2609			unsigned long page_size;
   2610
   2611			page_size = PAGE_SIZE << page_order;
   2612			start = min(addr, start);
   2613			end = max(addr + page_size, end);
   2614			flush_dmap = 1;
   2615		}
   2616	}
   2617
   2618	/*
   2619	 * Set direct map to something invalid so that it won't be cached if
   2620	 * there are any accesses after the TLB flush, then flush the TLB and
   2621	 * reset the direct map permissions to the default.
   2622	 */
   2623	set_area_direct_map(area, set_direct_map_invalid_noflush);
   2624	_vm_unmap_aliases(start, end, flush_dmap);
   2625	set_area_direct_map(area, set_direct_map_default_noflush);
   2626}
   2627
   2628static void __vunmap(const void *addr, int deallocate_pages)
   2629{
   2630	struct vm_struct *area;
   2631
   2632	if (!addr)
   2633		return;
   2634
   2635	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
   2636			addr))
   2637		return;
   2638
   2639	area = find_vm_area(addr);
   2640	if (unlikely(!area)) {
   2641		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
   2642				addr);
   2643		return;
   2644	}
   2645
   2646	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
   2647	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
   2648
   2649	kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
   2650
   2651	vm_remove_mappings(area, deallocate_pages);
   2652
   2653	if (deallocate_pages) {
   2654		int i;
   2655
   2656		for (i = 0; i < area->nr_pages; i++) {
   2657			struct page *page = area->pages[i];
   2658
   2659			BUG_ON(!page);
   2660			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
   2661			/*
   2662			 * High-order allocs for huge vmallocs are split, so
   2663			 * can be freed as an array of order-0 allocations
   2664			 */
   2665			__free_pages(page, 0);
   2666			cond_resched();
   2667		}
   2668		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
   2669
   2670		kvfree(area->pages);
   2671	}
   2672
   2673	kfree(area);
   2674}
   2675
   2676static inline void __vfree_deferred(const void *addr)
   2677{
   2678	/*
   2679	 * Use raw_cpu_ptr() because this can be called from preemptible
   2680	 * context. Preemption is absolutely fine here, because the llist_add()
   2681	 * implementation is lockless, so it works even if we are adding to
   2682	 * another cpu's list. schedule_work() should be fine with this too.
   2683	 */
   2684	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
   2685
   2686	if (llist_add((struct llist_node *)addr, &p->list))
   2687		schedule_work(&p->wq);
   2688}
   2689
   2690/**
   2691 * vfree_atomic - release memory allocated by vmalloc()
   2692 * @addr:	  memory base address
   2693 *
   2694 * This one is just like vfree() but can be called in any atomic context
   2695 * except NMIs.
   2696 */
   2697void vfree_atomic(const void *addr)
   2698{
   2699	BUG_ON(in_nmi());
   2700
   2701	kmemleak_free(addr);
   2702
   2703	if (!addr)
   2704		return;
   2705	__vfree_deferred(addr);
   2706}
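
       /*
        * Illustrative sketch (not built, hence the "#if 0"): vfree_atomic()
        * is meant for callers that may not sleep, e.g. while holding a
        * spinlock. The structure, list and lock below are hypothetical.
        */
       #if 0
       static DEFINE_SPINLOCK(example_lock);

       struct example_obj {
               struct list_head node;
               void *vbuf;             /* obtained from vmalloc() */
       };

       static void example_release(struct example_obj *obj)
       {
               spin_lock(&example_lock);
               list_del(&obj->node);
               /* Still under the spinlock: we must not sleep, so defer the free. */
               vfree_atomic(obj->vbuf);
               spin_unlock(&example_lock);

               kfree(obj);
       }
       #endif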
   2707
   2708static void __vfree(const void *addr)
   2709{
   2710	if (unlikely(in_interrupt()))
   2711		__vfree_deferred(addr);
   2712	else
   2713		__vunmap(addr, 1);
   2714}
   2715
   2716/**
   2717 * vfree - Release memory allocated by vmalloc()
   2718 * @addr:  Memory base address
   2719 *
   2720 * Free the virtually contiguous memory area starting at @addr, as obtained
   2721 * from one of the vmalloc() family of APIs.  This will usually also free the
   2722 * physical memory underlying the virtual allocation, but that memory is
   2723 * reference counted, so it will not be freed until the last user goes away.
   2724 *
   2725 * If @addr is NULL, no operation is performed.
   2726 *
   2727 * Context:
   2728 * May sleep if called *not* from interrupt context.
   2729 * Must not be called in NMI context (strictly speaking, it could be
   2730 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
   2731 * conventions for vfree() arch-dependent would be a really bad idea).
   2732 */
   2733void vfree(const void *addr)
   2734{
   2735	BUG_ON(in_nmi());
   2736
   2737	kmemleak_free(addr);
   2738
   2739	might_sleep_if(!in_interrupt());
   2740
   2741	if (!addr)
   2742		return;
   2743
   2744	__vfree(addr);
   2745}
   2746EXPORT_SYMBOL(vfree);
   2747
   2748/**
   2749 * vunmap - release virtual mapping obtained by vmap()
   2750 * @addr:   memory base address
   2751 *
   2752 * Free the virtually contiguous memory area starting at @addr,
   2753 * which was created from the page array passed to vmap().
   2754 *
   2755 * Must not be called in interrupt context.
   2756 */
   2757void vunmap(const void *addr)
   2758{
   2759	BUG_ON(in_interrupt());
   2760	might_sleep();
   2761	if (addr)
   2762		__vunmap(addr, 0);
   2763}
   2764EXPORT_SYMBOL(vunmap);
   2765
   2766/**
   2767 * vmap - map an array of pages into virtually contiguous space
   2768 * @pages: array of page pointers
   2769 * @count: number of pages to map
   2770 * @flags: vm_area->flags
   2771 * @prot: page protection for the mapping
   2772 *
   2773 * Maps @count pages from @pages into contiguous kernel virtual space.
   2774 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
   2775 * (which must be kmalloc or vmalloc memory) and one reference per page in it
   2776 * are transferred from the caller to vmap(), and will be freed / dropped when
   2777 * vfree() is called on the return value.
   2778 *
   2779 * Return: the address of the area or %NULL on failure
   2780 */
   2781void *vmap(struct page **pages, unsigned int count,
   2782	   unsigned long flags, pgprot_t prot)
   2783{
   2784	struct vm_struct *area;
   2785	unsigned long addr;
   2786	unsigned long size;		/* In bytes */
   2787
   2788	might_sleep();
   2789
   2790	/*
   2791	 * Your top guard is someone else's bottom guard. Not having a top
   2792	 * guard compromises someone else's mappings too.
   2793	 */
   2794	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
   2795		flags &= ~VM_NO_GUARD;
   2796
   2797	if (count > totalram_pages())
   2798		return NULL;
   2799
   2800	size = (unsigned long)count << PAGE_SHIFT;
   2801	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
   2802	if (!area)
   2803		return NULL;
   2804
   2805	addr = (unsigned long)area->addr;
   2806	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
   2807				pages, PAGE_SHIFT) < 0) {
   2808		vunmap(area->addr);
   2809		return NULL;
   2810	}
   2811
   2812	if (flags & VM_MAP_PUT_PAGES) {
   2813		area->pages = pages;
   2814		area->nr_pages = count;
   2815	}
   2816	return area->addr;
   2817}
   2818EXPORT_SYMBOL(vmap);
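
       /*
        * Illustrative sketch (not built, hence the "#if 0"): allocating
        * individual pages and making them virtually contiguous with vmap().
        * The helper name is hypothetical; the caller is expected to vunmap()
        * the address and free the pages later.
        */
       #if 0
       static void *example_alloc_vmapped(unsigned int nr_pages, struct page ***pagesp)
       {
               struct page **pages;
               void *vaddr;
               unsigned int i;

               pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
               if (!pages)
                       return NULL;

               for (i = 0; i < nr_pages; i++) {
                       pages[i] = alloc_page(GFP_KERNEL);
                       if (!pages[i])
                               goto err;
               }

               vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
               if (!vaddr)
                       goto err;

               *pagesp = pages;
               return vaddr;

       err:
               while (i--)
                       __free_page(pages[i]);
               kfree(pages);
               return NULL;
       }
       #endif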
   2819
   2820#ifdef CONFIG_VMAP_PFN
   2821struct vmap_pfn_data {
   2822	unsigned long	*pfns;
   2823	pgprot_t	prot;
   2824	unsigned int	idx;
   2825};
   2826
   2827static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
   2828{
   2829	struct vmap_pfn_data *data = private;
   2830
   2831	if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
   2832		return -EINVAL;
   2833	*pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
   2834	return 0;
   2835}
   2836
   2837/**
   2838 * vmap_pfn - map an array of PFNs into virtually contiguous space
   2839 * @pfns: array of PFNs
   2840 * @count: number of pages to map
   2841 * @prot: page protection for the mapping
   2842 *
   2843 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
   2844 * the start address of the mapping.
   2845 */
   2846void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
   2847{
   2848	struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
   2849	struct vm_struct *area;
   2850
   2851	area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
   2852			__builtin_return_address(0));
   2853	if (!area)
   2854		return NULL;
   2855	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
   2856			count * PAGE_SIZE, vmap_pfn_apply, &data)) {
   2857		free_vm_area(area);
   2858		return NULL;
   2859	}
   2860	return area->addr;
   2861}
   2862EXPORT_SYMBOL_GPL(vmap_pfn);
   2863#endif /* CONFIG_VMAP_PFN */
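
       /*
        * Illustrative sketch (not built, hence the "#if 0"): mapping @count
        * physically contiguous, non-struct-page-backed pages starting at
        * @start_pfn with vmap_pfn(). The PFNs must not be pfn_valid(), see
        * vmap_pfn_apply() above. The helper name is hypothetical.
        */
       #if 0
       static void __iomem *example_map_device_pfns(unsigned long start_pfn,
                                                    unsigned int count, pgprot_t prot)
       {
               unsigned long *pfns;
               void *vaddr;
               unsigned int i;

               pfns = kcalloc(count, sizeof(*pfns), GFP_KERNEL);
               if (!pfns)
                       return NULL;

               for (i = 0; i < count; i++)
                       pfns[i] = start_pfn + i;

               vaddr = vmap_pfn(pfns, count, prot);
               kfree(pfns);    /* vmap_pfn() does not keep the array */

               return (void __iomem *)vaddr;
       }
       #endif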
   2864
   2865static inline unsigned int
   2866vm_area_alloc_pages(gfp_t gfp, int nid,
   2867		unsigned int order, unsigned int nr_pages, struct page **pages)
   2868{
   2869	unsigned int nr_allocated = 0;
   2870	struct page *page;
   2871	int i;
   2872
   2873	/*
   2874	 * For order-0 pages we make use of the bulk allocator. If
   2875	 * the page array ends up only partly populated (or not at
   2876	 * all) due to failures, fall back to the single page
   2877	 * allocator, which is more permissive.
   2878	 */
   2879	if (!order) {
   2880		gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
   2881
   2882		while (nr_allocated < nr_pages) {
   2883			unsigned int nr, nr_pages_request;
   2884
   2885			/*
   2886			 * The maximum allowed request is hard-coded to 100
   2887			 * pages per call, in order to avoid long preemption-off
   2888			 * sections in the bulk allocator, so the range is
   2889			 * [1:100].
   2890			 */
   2891			nr_pages_request = min(100U, nr_pages - nr_allocated);
   2892
   2893			/* Memory allocation should consider mempolicy: we must not
   2894			 * blindly use the nearest node when nid == NUMA_NO_NODE,
   2895			 * otherwise memory may be allocated from only one node
   2896			 * even though the mempolicy asks for interleaving.
   2897			 */
   2898			if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
   2899				nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
   2900							nr_pages_request,
   2901							pages + nr_allocated);
   2902
   2903			else
   2904				nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
   2905							nr_pages_request,
   2906							pages + nr_allocated);
   2907
   2908			nr_allocated += nr;
   2909			cond_resched();
   2910
   2911			/*
   2912			 * If zero pages were obtained, or the request was only
   2913			 * partly satisfied, fall back to the single page allocator.
   2914			 */
   2915			if (nr != nr_pages_request)
   2916				break;
   2917		}
   2918	}
   2919
   2920	/* High-order pages or fallback path if "bulk" fails. */
   2921
   2922	while (nr_allocated < nr_pages) {
   2923		if (fatal_signal_pending(current))
   2924			break;
   2925
   2926		if (nid == NUMA_NO_NODE)
   2927			page = alloc_pages(gfp, order);
   2928		else
   2929			page = alloc_pages_node(nid, gfp, order);
   2930		if (unlikely(!page))
   2931			break;
   2932		/*
   2933		 * Higher order allocations must be able to be treated as
   2934		 * independent small pages by callers (as they can with
   2935		 * small-page vmallocs). Some drivers do their own refcounting
   2936		 * on vmalloc_to_page() pages, some use page->mapping,
   2937		 * page->lru, etc.
   2938		 */
   2939		if (order)
   2940			split_page(page, order);
   2941
   2942		/*
   2943		 * Careful, we allocate and map page-order pages, but
   2944		 * tracking is done per PAGE_SIZE page so as to keep the
   2945		 * vm_struct APIs independent of the physical/mapped size.
   2946		 */
   2947		for (i = 0; i < (1U << order); i++)
   2948			pages[nr_allocated + i] = page + i;
   2949
   2950		cond_resched();
   2951		nr_allocated += 1U << order;
   2952	}
   2953
   2954	return nr_allocated;
   2955}
   2956
   2957static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
   2958				 pgprot_t prot, unsigned int page_shift,
   2959				 int node)
   2960{
   2961	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
   2962	bool nofail = gfp_mask & __GFP_NOFAIL;
   2963	unsigned long addr = (unsigned long)area->addr;
   2964	unsigned long size = get_vm_area_size(area);
   2965	unsigned long array_size;
   2966	unsigned int nr_small_pages = size >> PAGE_SHIFT;
   2967	unsigned int page_order;
   2968	unsigned int flags;
   2969	int ret;
   2970
   2971	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
   2972	gfp_mask |= __GFP_NOWARN;
   2973	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
   2974		gfp_mask |= __GFP_HIGHMEM;
   2975
   2976	/* Please note that the recursion is strictly bounded. */
   2977	if (array_size > PAGE_SIZE) {
   2978		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
   2979					area->caller);
   2980	} else {
   2981		area->pages = kmalloc_node(array_size, nested_gfp, node);
   2982	}
   2983
   2984	if (!area->pages) {
   2985		warn_alloc(gfp_mask, NULL,
   2986			"vmalloc error: size %lu, failed to allocate page array size %lu",
   2987			nr_small_pages * PAGE_SIZE, array_size);
   2988		free_vm_area(area);
   2989		return NULL;
   2990	}
   2991
   2992	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
   2993	page_order = vm_area_page_order(area);
   2994
   2995	area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
   2996		node, page_order, nr_small_pages, area->pages);
   2997
   2998	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
   2999	if (gfp_mask & __GFP_ACCOUNT) {
   3000		int i;
   3001
   3002		for (i = 0; i < area->nr_pages; i++)
   3003			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
   3004	}
   3005
   3006	/*
   3007	 * If not enough pages were obtained to satisfy the allocation
   3008	 * request, free whatever was allocated via __vfree().
   3009	 */
   3010	if (area->nr_pages != nr_small_pages) {
   3011		warn_alloc(gfp_mask, NULL,
   3012			"vmalloc error: size %lu, page order %u, failed to allocate pages",
   3013			area->nr_pages * PAGE_SIZE, page_order);
   3014		goto fail;
   3015	}
   3016
   3017	/*
   3018	 * Page table allocations ignore the external gfp mask, so enforce
   3019	 * it via the memalloc scope API.
   3020	 */
   3021	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
   3022		flags = memalloc_nofs_save();
   3023	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
   3024		flags = memalloc_noio_save();
   3025
   3026	do {
   3027		ret = vmap_pages_range(addr, addr + size, prot, area->pages,
   3028			page_shift);
   3029		if (nofail && (ret < 0))
   3030			schedule_timeout_uninterruptible(1);
   3031	} while (nofail && (ret < 0));
   3032
   3033	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
   3034		memalloc_nofs_restore(flags);
   3035	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
   3036		memalloc_noio_restore(flags);
   3037
   3038	if (ret < 0) {
   3039		warn_alloc(gfp_mask, NULL,
   3040			"vmalloc error: size %lu, failed to map pages",
   3041			area->nr_pages * PAGE_SIZE);
   3042		goto fail;
   3043	}
   3044
   3045	return area->addr;
   3046
   3047fail:
   3048	__vfree(area->addr);
   3049	return NULL;
   3050}
   3051
   3052/**
   3053 * __vmalloc_node_range - allocate virtually contiguous memory
   3054 * @size:		  allocation size
   3055 * @align:		  desired alignment
   3056 * @start:		  vm area range start
   3057 * @end:		  vm area range end
   3058 * @gfp_mask:		  flags for the page level allocator
   3059 * @prot:		  protection mask for the allocated pages
   3060 * @vm_flags:		  additional vm area flags (e.g. %VM_NO_GUARD)
   3061 * @node:		  node to use for allocation or NUMA_NO_NODE
   3062 * @caller:		  caller's return address
   3063 *
   3064 * Allocate enough pages to cover @size from the page level
   3065 * allocator with @gfp_mask flags. Please note that the full set of gfp
   3066 * flags is not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
   3067 * supported.
   3068 * Zone modifiers are not supported. Among the reclaim modifiers,
   3069 * __GFP_DIRECT_RECLAIM is required (i.e. GFP_NOWAIT is not supported)
   3070 * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
   3071 * __GFP_RETRY_MAYFAIL are not supported).
   3072 *
   3073 * __GFP_NOWARN can be used to suppress failure messages.
   3074 *
   3075 * Map them into contiguous kernel virtual space, using a pagetable
   3076 * protection of @prot.
   3077 *
   3078 * Return: the address of the area or %NULL on failure
   3079 */
   3080void *__vmalloc_node_range(unsigned long size, unsigned long align,
   3081			unsigned long start, unsigned long end, gfp_t gfp_mask,
   3082			pgprot_t prot, unsigned long vm_flags, int node,
   3083			const void *caller)
   3084{
   3085	struct vm_struct *area;
   3086	void *ret;
   3087	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
   3088	unsigned long real_size = size;
   3089	unsigned long real_align = align;
   3090	unsigned int shift = PAGE_SHIFT;
   3091
   3092	if (WARN_ON_ONCE(!size))
   3093		return NULL;
   3094
   3095	if ((size >> PAGE_SHIFT) > totalram_pages()) {
   3096		warn_alloc(gfp_mask, NULL,
   3097			"vmalloc error: size %lu, exceeds total pages",
   3098			real_size);
   3099		return NULL;
   3100	}
   3101
   3102	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
   3103		unsigned long size_per_node;
   3104
   3105		/*
   3106		 * Try huge pages. Only try for PAGE_KERNEL allocations,
   3107		 * others like modules don't yet expect huge pages in
   3108		 * their allocations due to apply_to_page_range not
   3109		 * supporting them.
   3110		 */
   3111
   3112		size_per_node = size;
   3113		if (node == NUMA_NO_NODE)
   3114			size_per_node /= num_online_nodes();
   3115		if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
   3116			shift = PMD_SHIFT;
   3117		else
   3118			shift = arch_vmap_pte_supported_shift(size_per_node);
   3119
   3120		align = max(real_align, 1UL << shift);
   3121		size = ALIGN(real_size, 1UL << shift);
   3122	}
   3123
   3124again:
   3125	area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
   3126				  VM_UNINITIALIZED | vm_flags, start, end, node,
   3127				  gfp_mask, caller);
   3128	if (!area) {
   3129		bool nofail = gfp_mask & __GFP_NOFAIL;
   3130		warn_alloc(gfp_mask, NULL,
   3131			"vmalloc error: size %lu, vm_struct allocation failed%s",
   3132			real_size, (nofail) ? ". Retrying." : "");
   3133		if (nofail) {
   3134			schedule_timeout_uninterruptible(1);
   3135			goto again;
   3136		}
   3137		goto fail;
   3138	}
   3139
   3140	/*
   3141	 * Prepare arguments for __vmalloc_area_node() and
   3142	 * kasan_unpoison_vmalloc().
   3143	 */
   3144	if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
   3145		if (kasan_hw_tags_enabled()) {
   3146			/*
   3147			 * Modify protection bits to allow tagging.
   3148			 * This must be done before mapping.
   3149			 */
   3150			prot = arch_vmap_pgprot_tagged(prot);
   3151
   3152			/*
   3153			 * Skip page_alloc poisoning and zeroing for physical
   3154			 * pages backing VM_ALLOC mapping. Memory is instead
   3155			 * poisoned and zeroed by kasan_unpoison_vmalloc().
   3156			 */
   3157			gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
   3158		}
   3159
   3160		/* Take note that the mapping is PAGE_KERNEL. */
   3161		kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
   3162	}
   3163
   3164	/* Allocate physical pages and map them into vmalloc space. */
   3165	ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
   3166	if (!ret)
   3167		goto fail;
   3168
   3169	/*
   3170	 * Mark the pages as accessible, now that they are mapped.
   3171	 * The init condition should match the one in post_alloc_hook()
   3172	 * (except for the should_skip_init() check) to make sure that memory
   3173	 * is initialized under the same conditions regardless of the enabled
   3174	 * KASAN mode.
   3175	 * Tag-based KASAN modes only assign tags to normal non-executable
   3176	 * allocations, see __kasan_unpoison_vmalloc().
   3177	 */
   3178	kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
   3179	if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
   3180		kasan_flags |= KASAN_VMALLOC_INIT;
   3181	/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
   3182	area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
   3183
   3184	/*
   3185	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
   3186	 * flag. It means that vm_struct is not fully initialized.
   3187	 * Now, it is fully initialized, so remove this flag here.
   3188	 */
   3189	clear_vm_uninitialized_flag(area);
   3190
   3191	size = PAGE_ALIGN(size);
   3192	if (!(vm_flags & VM_DEFER_KMEMLEAK))
   3193		kmemleak_vmalloc(area, size, gfp_mask);
   3194
   3195	return area->addr;
   3196
   3197fail:
   3198	if (shift > PAGE_SHIFT) {
   3199		shift = PAGE_SHIFT;
   3200		align = real_align;
   3201		size = real_size;
   3202		goto again;
   3203	}
   3204
   3205	return NULL;
   3206}
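
       /*
        * Illustrative sketch (not built, hence the "#if 0"): a
        * module_alloc()-style caller that places an executable allocation in
        * a dedicated VA window. It assumes the architecture provides
        * MODULES_VADDR/MODULES_END and PAGE_KERNEL_EXEC; the helper name is
        * hypothetical.
        */
       #if 0
       static void *example_alloc_exec(unsigned long size)
       {
               return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
                                           GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
                                           NUMA_NO_NODE, __builtin_return_address(0));
       }
       #endif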
   3207
   3208/**
   3209 * __vmalloc_node - allocate virtually contiguous memory
   3210 * @size:	    allocation size
   3211 * @align:	    desired alignment
   3212 * @gfp_mask:	    flags for the page level allocator
   3213 * @node:	    node to use for allocation or NUMA_NO_NODE
   3214 * @caller:	    caller's return address
   3215 *
   3216 * Allocate enough pages to cover @size from the page level allocator with
   3217 * @gfp_mask flags.  Map them into contiguous kernel virtual space.
   3218 *
   3219 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
   3220 * and __GFP_NOFAIL are not supported
   3221 *
   3222 * Any use of gfp flags outside of GFP_KERNEL should be discussed
   3223 * with the mm people first.
   3224 *
   3225 * Return: pointer to the allocated memory or %NULL on error
   3226 */
   3227void *__vmalloc_node(unsigned long size, unsigned long align,
   3228			    gfp_t gfp_mask, int node, const void *caller)
   3229{
   3230	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
   3231				gfp_mask, PAGE_KERNEL, 0, node, caller);
   3232}
   3233/*
   3234 * This is only for performance analysis and stress testing of vmalloc.
   3235 * It is required by the vmalloc test module; do not use it for anything
   3236 * else.
   3237 */
   3238#ifdef CONFIG_TEST_VMALLOC_MODULE
   3239EXPORT_SYMBOL_GPL(__vmalloc_node);
   3240#endif
   3241
   3242void *__vmalloc(unsigned long size, gfp_t gfp_mask)
   3243{
   3244	return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
   3245				__builtin_return_address(0));
   3246}
   3247EXPORT_SYMBOL(__vmalloc);
   3248
   3249/**
   3250 * vmalloc - allocate virtually contiguous memory
   3251 * @size:    allocation size
   3252 *
   3253 * Allocate enough pages to cover @size from the page level
   3254 * allocator and map them into contiguous kernel virtual space.
   3255 *
   3256 * For tight control over page level allocator and protection flags
   3257 * use __vmalloc() instead.
   3258 *
   3259 * Return: pointer to the allocated memory or %NULL on error
   3260 */
   3261void *vmalloc(unsigned long size)
   3262{
   3263	return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
   3264				__builtin_return_address(0));
   3265}
   3266EXPORT_SYMBOL(vmalloc);
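
       /*
        * Illustrative sketch (not built, hence the "#if 0"): the common
        * pattern of falling back from kmalloc() to vmalloc() for a large
        * buffer that does not need to be physically contiguous; the result
        * is released with kvfree(). The helper name is hypothetical.
        */
       #if 0
       static void *example_alloc_table(size_t size)
       {
               void *p;

               p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
               if (!p)
                       p = vmalloc(size);

               return p;       /* release with kvfree() */
       }
       #endif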
   3267
   3268/**
   3269 * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
   3270 * @size:      allocation size
   3271 * @gfp_mask:  flags for the page level allocator
   3272 *
   3273 * Allocate enough pages to cover @size from the page level
   3274 * allocator and map them into contiguous kernel virtual space.
   3275 * If @size is greater than or equal to PMD_SIZE, allow using
   3276 * huge pages for the memory.
   3277 *
   3278 * Return: pointer to the allocated memory or %NULL on error
   3279 */
   3280void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
   3281{
   3282	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
   3283				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
   3284				    NUMA_NO_NODE, __builtin_return_address(0));
   3285}
   3286EXPORT_SYMBOL_GPL(vmalloc_huge);
   3287
   3288/**
   3289 * vzalloc - allocate virtually contiguous memory with zero fill
   3290 * @size:    allocation size
   3291 *
   3292 * Allocate enough pages to cover @size from the page level
   3293 * allocator and map them into contiguous kernel virtual space.
   3294 * The memory allocated is set to zero.
   3295 *
   3296 * For tight control over page level allocator and protection flags
   3297 * use __vmalloc() instead.
   3298 *
   3299 * Return: pointer to the allocated memory or %NULL on error
   3300 */
   3301void *vzalloc(unsigned long size)
   3302{
   3303	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
   3304				__builtin_return_address(0));
   3305}
   3306EXPORT_SYMBOL(vzalloc);
   3307
   3308/**
   3309 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
   3310 * @size: allocation size
   3311 *
   3312 * The resulting memory area is zeroed so it can be mapped to userspace
   3313 * without leaking data.
   3314 *
   3315 * Return: pointer to the allocated memory or %NULL on error
   3316 */
   3317void *vmalloc_user(unsigned long size)
   3318{
   3319	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
   3320				    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
   3321				    VM_USERMAP, NUMA_NO_NODE,
   3322				    __builtin_return_address(0));
   3323}
   3324EXPORT_SYMBOL(vmalloc_user);
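
       /*
        * Illustrative sketch (not built, hence the "#if 0"): a driver mmap
        * handler exporting a vmalloc_user() buffer to userspace with
        * remap_vmalloc_range(), which requires the VM_USERMAP flag that
        * vmalloc_user() sets. The buffer pointer and handler name are
        * hypothetical.
        */
       #if 0
       static void *example_shared_buf;        /* from vmalloc_user() */

       static int example_mmap(struct file *file, struct vm_area_struct *vma)
       {
               return remap_vmalloc_range(vma, example_shared_buf, vma->vm_pgoff);
       }
       #endif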
   3325
   3326/**
   3327 * vmalloc_node - allocate memory on a specific node
   3328 * @size:	  allocation size
   3329 * @node:	  numa node
   3330 *
   3331 * Allocate enough pages to cover @size from the page level
   3332 * allocator and map them into contiguous kernel virtual space.
   3333 *
   3334 * For tight control over page level allocator and protection flags
   3335 * use __vmalloc() instead.
   3336 *
   3337 * Return: pointer to the allocated memory or %NULL on error
   3338 */
   3339void *vmalloc_node(unsigned long size, int node)
   3340{
   3341	return __vmalloc_node(size, 1, GFP_KERNEL, node,
   3342			__builtin_return_address(0));
   3343}
   3344EXPORT_SYMBOL(vmalloc_node);
   3345
   3346/**
   3347 * vzalloc_node - allocate memory on a specific node with zero fill
   3348 * @size:	allocation size
   3349 * @node:	numa node
   3350 *
   3351 * Allocate enough pages to cover @size from the page level
   3352 * allocator and map them into contiguous kernel virtual space.
   3353 * The memory allocated is set to zero.
   3354 *
   3355 * Return: pointer to the allocated memory or %NULL on error
   3356 */
   3357void *vzalloc_node(unsigned long size, int node)
   3358{
   3359	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
   3360				__builtin_return_address(0));
   3361}
   3362EXPORT_SYMBOL(vzalloc_node);
   3363
   3364#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
   3365#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
   3366#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
   3367#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
   3368#else
   3369/*
   3370 * 64b systems should always have either DMA or DMA32 zones. For others
   3371 * GFP_DMA32 should do the right thing and use the normal zone.
   3372 */
   3373#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
   3374#endif
   3375
   3376/**
   3377 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
   3378 * @size:	allocation size
   3379 *
   3380 * Allocate enough 32bit PA addressable pages to cover @size from the
   3381 * page level allocator and map them into contiguous kernel virtual space.
   3382 *
   3383 * Return: pointer to the allocated memory or %NULL on error
   3384 */
   3385void *vmalloc_32(unsigned long size)
   3386{
   3387	return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
   3388			__builtin_return_address(0));
   3389}
   3390EXPORT_SYMBOL(vmalloc_32);
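
/*
 * Illustrative usage sketch (hypothetical caller, not from this file):
 * vmalloc_32() suits devices that can only address 32-bit physical
 * memory but do not need a physically contiguous buffer (the driver
 * typically builds a scatter-gather list from the individual pages).
 */
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_alloc_dma32_ring(unsigned long nr_pages)
{
	/*
	 * With GFP_VMALLOC32, each backing page is 32-bit addressable on
	 * configurations that provide a DMA/DMA32 zone.
	 */
	return vmalloc_32(nr_pages << PAGE_SHIFT);
}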
   3391
   3392/**
   3393 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
   3394 * @size:	     allocation size
   3395 *
   3396 * The resulting memory area is 32bit addressable and zeroed so it can be
   3397 * mapped to userspace without leaking data.
   3398 *
   3399 * Return: pointer to the allocated memory or %NULL on error
   3400 */
   3401void *vmalloc_32_user(unsigned long size)
   3402{
   3403	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
   3404				    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
   3405				    VM_USERMAP, NUMA_NO_NODE,
   3406				    __builtin_return_address(0));
   3407}
   3408EXPORT_SYMBOL(vmalloc_32_user);
   3409
   3410/*
    3411 * Small helper routine: copy contents from addr into buf.
    3412 * If a page is not present, the corresponding range is zero-filled.
   3413 */
   3414
   3415static int aligned_vread(char *buf, char *addr, unsigned long count)
   3416{
   3417	struct page *p;
   3418	int copied = 0;
   3419
   3420	while (count) {
   3421		unsigned long offset, length;
   3422
   3423		offset = offset_in_page(addr);
   3424		length = PAGE_SIZE - offset;
   3425		if (length > count)
   3426			length = count;
   3427		p = vmalloc_to_page(addr);
    3428		/*
    3429		 * To access this _mapped_ area safely we would need a lock.
    3430		 * But taking a lock here would add vmalloc()/vfree() overhead
    3431		 * just for this rarely used _debug_ interface. Instead, use
    3432		 * kmap_atomic() and accept a small overhead in this access
    3433		 * function.
    3434		 */
   3435		if (p) {
   3436			/* We can expect USER0 is not used -- see vread() */
   3437			void *map = kmap_atomic(p);
   3438			memcpy(buf, map + offset, length);
   3439			kunmap_atomic(map);
   3440		} else
   3441			memset(buf, 0, length);
   3442
   3443		addr += length;
   3444		buf += length;
   3445		copied += length;
   3446		count -= length;
   3447	}
   3448	return copied;
   3449}
   3450
   3451/**
   3452 * vread() - read vmalloc area in a safe way.
   3453 * @buf:     buffer for reading data
   3454 * @addr:    vm address.
   3455 * @count:   number of bytes to be read.
   3456 *
    3457 * This function checks that addr is a valid vmalloc'ed area and
    3458 * copies data from that area to the given buffer. If the given memory
    3459 * range [addr...addr+count) includes some valid address, data is copied
    3460 * to the proper part of @buf. Memory holes are zero-filled. An IOREMAP
    3461 * area is treated as a memory hole and no copy is done.
    3462 *
    3463 * If [addr...addr+count) does not intersect any live vm_struct area,
    3464 * 0 is returned. @buf must be a kernel buffer.
    3465 *
    3466 * Note: vread() is normally unnecessary because the caller should know
    3467 * that the vmalloc() area is valid and can simply use memcpy().
    3468 * It exists for routines which have to access the vmalloc area without
    3469 * any such information, such as /proc/kcore.
   3470 *
    3471 * Return: number of bytes by which addr and buf should be increased
    3472 * (the same number as @count), or %0 if [addr...addr+count) does not
    3473 * intersect any valid vmalloc area
   3474 */
   3475long vread(char *buf, char *addr, unsigned long count)
   3476{
   3477	struct vmap_area *va;
   3478	struct vm_struct *vm;
   3479	char *vaddr, *buf_start = buf;
   3480	unsigned long buflen = count;
   3481	unsigned long n;
   3482
   3483	addr = kasan_reset_tag(addr);
   3484
   3485	/* Don't allow overflow */
   3486	if ((unsigned long) addr + count < count)
   3487		count = -(unsigned long) addr;
   3488
   3489	spin_lock(&vmap_area_lock);
   3490	va = find_vmap_area_exceed_addr((unsigned long)addr);
   3491	if (!va)
   3492		goto finished;
   3493
    3494	/* no intersection with a live vmap_area */
   3495	if ((unsigned long)addr + count <= va->va_start)
   3496		goto finished;
   3497
   3498	list_for_each_entry_from(va, &vmap_area_list, list) {
   3499		if (!count)
   3500			break;
   3501
   3502		if (!va->vm)
   3503			continue;
   3504
   3505		vm = va->vm;
   3506		vaddr = (char *) vm->addr;
   3507		if (addr >= vaddr + get_vm_area_size(vm))
   3508			continue;
   3509		while (addr < vaddr) {
   3510			if (count == 0)
   3511				goto finished;
   3512			*buf = '\0';
   3513			buf++;
   3514			addr++;
   3515			count--;
   3516		}
   3517		n = vaddr + get_vm_area_size(vm) - addr;
   3518		if (n > count)
   3519			n = count;
   3520		if (!(vm->flags & VM_IOREMAP))
   3521			aligned_vread(buf, addr, n);
   3522		else /* IOREMAP area is treated as memory hole */
   3523			memset(buf, 0, n);
   3524		buf += n;
   3525		addr += n;
   3526		count -= n;
   3527	}
   3528finished:
   3529	spin_unlock(&vmap_area_lock);
   3530
   3531	if (buf == buf_start)
   3532		return 0;
   3533	/* zero-fill memory holes */
   3534	if (buf != buf_start + buflen)
   3535		memset(buf, 0, buflen - (buf - buf_start));
   3536
   3537	return buflen;
   3538}
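
/*
 * Illustrative usage sketch (hypothetical caller, not from this file):
 * vread() is for callers such as /proc/kcore that must read a vmalloc
 * range without knowing whether every page in it is actually mapped.
 */
#include <linux/vmalloc.h>

static long demo_dump_vmalloc_range(char *dst, void *src, unsigned long len)
{
	/*
	 * Holes and IOREMAP ranges come back zero-filled; the return value
	 * is @len if [src, src+len) overlaps any live vmalloc area, else 0.
	 */
	return vread(dst, (char *)src, len);
}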
   3539
   3540/**
   3541 * remap_vmalloc_range_partial - map vmalloc pages to userspace
   3542 * @vma:		vma to cover
   3543 * @uaddr:		target user address to start at
   3544 * @kaddr:		virtual address of vmalloc kernel memory
   3545 * @pgoff:		offset from @kaddr to start at
   3546 * @size:		size of map area
   3547 *
   3548 * Returns:	0 for success, -Exxx on failure
   3549 *
    3550 * This function checks that @kaddr is a valid vmalloc'ed area
    3551 * and that it is big enough to cover the range starting at
    3552 * @uaddr in @vma. It returns failure if those criteria aren't
    3553 * met.
   3554 *
   3555 * Similar to remap_pfn_range() (see mm/memory.c)
   3556 */
   3557int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
   3558				void *kaddr, unsigned long pgoff,
   3559				unsigned long size)
   3560{
   3561	struct vm_struct *area;
   3562	unsigned long off;
   3563	unsigned long end_index;
   3564
   3565	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
   3566		return -EINVAL;
   3567
   3568	size = PAGE_ALIGN(size);
   3569
   3570	if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
   3571		return -EINVAL;
   3572
   3573	area = find_vm_area(kaddr);
   3574	if (!area)
   3575		return -EINVAL;
   3576
   3577	if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
   3578		return -EINVAL;
   3579
   3580	if (check_add_overflow(size, off, &end_index) ||
   3581	    end_index > get_vm_area_size(area))
   3582		return -EINVAL;
   3583	kaddr += off;
   3584
   3585	do {
   3586		struct page *page = vmalloc_to_page(kaddr);
   3587		int ret;
   3588
   3589		ret = vm_insert_page(vma, uaddr, page);
   3590		if (ret)
   3591			return ret;
   3592
   3593		uaddr += PAGE_SIZE;
   3594		kaddr += PAGE_SIZE;
   3595		size -= PAGE_SIZE;
   3596	} while (size > 0);
   3597
   3598	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
   3599
   3600	return 0;
   3601}
   3602
   3603/**
   3604 * remap_vmalloc_range - map vmalloc pages to userspace
   3605 * @vma:		vma to cover (map full range of vma)
   3606 * @addr:		vmalloc memory
   3607 * @pgoff:		number of pages into addr before first page to map
   3608 *
   3609 * Returns:	0 for success, -Exxx on failure
   3610 *
    3611 * This function checks that addr is a valid vmalloc'ed area and
    3612 * that it is big enough to cover the vma. It returns failure if
    3613 * those criteria aren't met.
   3614 *
   3615 * Similar to remap_pfn_range() (see mm/memory.c)
   3616 */
   3617int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
   3618						unsigned long pgoff)
   3619{
   3620	return remap_vmalloc_range_partial(vma, vma->vm_start,
   3621					   addr, pgoff,
   3622					   vma->vm_end - vma->vm_start);
   3623}
   3624EXPORT_SYMBOL(remap_vmalloc_range);
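
/*
 * Illustrative usage sketch (hypothetical driver code, not from this
 * file): the usual pairing is a vmalloc_user() buffer exported from a
 * driver's ->mmap() handler via remap_vmalloc_range().
 */
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_mmap_buf;	/* assumed to be a vmalloc_user() allocation */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!demo_mmap_buf)
		return -EINVAL;

	/*
	 * remap_vmalloc_range() verifies that the buffer carries VM_USERMAP
	 * (vmalloc_user() sets it) and is large enough to cover the whole
	 * vma; vma->vm_pgoff selects the starting page within the buffer.
	 */
	return remap_vmalloc_range(vma, demo_mmap_buf, vma->vm_pgoff);
}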
   3625
   3626void free_vm_area(struct vm_struct *area)
   3627{
   3628	struct vm_struct *ret;
   3629	ret = remove_vm_area(area->addr);
   3630	BUG_ON(ret != area);
   3631	kfree(area);
   3632}
   3633EXPORT_SYMBOL_GPL(free_vm_area);
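
/*
 * Illustrative usage sketch (hypothetical caller, not from this file):
 * free_vm_area() pairs with get_vm_area()-style reservations made by
 * callers that reserve kernel VA space, populate it by other means and
 * tear it down later. It frees the vm_struct and the VA range only;
 * any backing pages remain the caller's responsibility.
 */
#include <linux/vmalloc.h>

static struct vm_struct *demo_reserve_va(unsigned long size)
{
	/* Reserve VA space only; no backing pages are allocated here. */
	return get_vm_area(size, VM_IOREMAP);
}

static void demo_release_va(struct vm_struct *area)
{
	free_vm_area(area);
}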
   3634
   3635#ifdef CONFIG_SMP
   3636static struct vmap_area *node_to_va(struct rb_node *n)
   3637{
   3638	return rb_entry_safe(n, struct vmap_area, rb_node);
   3639}
   3640
   3641/**
   3642 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
   3643 * @addr: target address
   3644 *
    3645 * Returns: the vmap_area if one is found. If there is no such area,
    3646 *   the closest preceding vmap_area (in reverse order) is returned,
    3647 *   i.e. one with va->va_start < addr && va->va_end < addr, or NULL
    3648 *   if there is no area at all before @addr.
   3649 */
   3650static struct vmap_area *
   3651pvm_find_va_enclose_addr(unsigned long addr)
   3652{
   3653	struct vmap_area *va, *tmp;
   3654	struct rb_node *n;
   3655
   3656	n = free_vmap_area_root.rb_node;
   3657	va = NULL;
   3658
   3659	while (n) {
   3660		tmp = rb_entry(n, struct vmap_area, rb_node);
   3661		if (tmp->va_start <= addr) {
   3662			va = tmp;
   3663			if (tmp->va_end >= addr)
   3664				break;
   3665
   3666			n = n->rb_right;
   3667		} else {
   3668			n = n->rb_left;
   3669		}
   3670	}
   3671
   3672	return va;
   3673}
   3674
   3675/**
    3676 * pvm_determine_end_from_reverse - find the highest aligned address
    3677 * of a free block below VMALLOC_END
    3678 * @va:
    3679 *   in - the VA from which the search starts (in reverse order);
    3680 *   out - the VA with the highest aligned end address.
    3681 * @align: alignment for the required highest address
   3682 *
   3683 * Returns: determined end address within vmap_area
   3684 */
   3685static unsigned long
   3686pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
   3687{
   3688	unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
   3689	unsigned long addr;
   3690
   3691	if (likely(*va)) {
   3692		list_for_each_entry_from_reverse((*va),
   3693				&free_vmap_area_list, list) {
   3694			addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
   3695			if ((*va)->va_start < addr)
   3696				return addr;
   3697		}
   3698	}
   3699
   3700	return 0;
   3701}
   3702
   3703/**
   3704 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
   3705 * @offsets: array containing offset of each area
   3706 * @sizes: array containing size of each area
   3707 * @nr_vms: the number of areas to allocate
   3708 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
   3709 *
   3710 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
   3711 *	    vm_structs on success, %NULL on failure
   3712 *
   3713 * Percpu allocator wants to use congruent vm areas so that it can
   3714 * maintain the offsets among percpu areas.  This function allocates
    3715 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
    3716 * be scattered pretty far apart, with the distance between two areas
    3717 * easily reaching gigabytes.  To avoid interacting with regular
    3718 * vmallocs, these areas are allocated from the top.
    3719 *
    3720 * Despite its complicated look, this allocator is rather simple. It
    3721 * does everything top-down and scans free blocks from the end, looking
    3722 * for a matching base. While scanning, if any of the areas does not fit,
    3723 * the base address is pulled down to fit that area. Scanning is repeated
    3724 * until all the areas fit; then all necessary data structures are
    3725 * inserted and the result is returned.
   3726 */
   3727struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
   3728				     const size_t *sizes, int nr_vms,
   3729				     size_t align)
   3730{
   3731	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
   3732	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
   3733	struct vmap_area **vas, *va;
   3734	struct vm_struct **vms;
   3735	int area, area2, last_area, term_area;
   3736	unsigned long base, start, size, end, last_end, orig_start, orig_end;
   3737	bool purged = false;
   3738	enum fit_type type;
   3739
   3740	/* verify parameters and allocate data structures */
   3741	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
   3742	for (last_area = 0, area = 0; area < nr_vms; area++) {
   3743		start = offsets[area];
   3744		end = start + sizes[area];
   3745
   3746		/* is everything aligned properly? */
   3747		BUG_ON(!IS_ALIGNED(offsets[area], align));
   3748		BUG_ON(!IS_ALIGNED(sizes[area], align));
   3749
   3750		/* detect the area with the highest address */
   3751		if (start > offsets[last_area])
   3752			last_area = area;
   3753
   3754		for (area2 = area + 1; area2 < nr_vms; area2++) {
   3755			unsigned long start2 = offsets[area2];
   3756			unsigned long end2 = start2 + sizes[area2];
   3757
   3758			BUG_ON(start2 < end && start < end2);
   3759		}
   3760	}
   3761	last_end = offsets[last_area] + sizes[last_area];
   3762
   3763	if (vmalloc_end - vmalloc_start < last_end) {
   3764		WARN_ON(true);
   3765		return NULL;
   3766	}
   3767
   3768	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
   3769	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
   3770	if (!vas || !vms)
   3771		goto err_free2;
   3772
   3773	for (area = 0; area < nr_vms; area++) {
   3774		vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
   3775		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
   3776		if (!vas[area] || !vms[area])
   3777			goto err_free;
   3778	}
   3779retry:
   3780	spin_lock(&free_vmap_area_lock);
   3781
   3782	/* start scanning - we scan from the top, begin with the last area */
   3783	area = term_area = last_area;
   3784	start = offsets[area];
   3785	end = start + sizes[area];
   3786
   3787	va = pvm_find_va_enclose_addr(vmalloc_end);
   3788	base = pvm_determine_end_from_reverse(&va, align) - end;
   3789
   3790	while (true) {
   3791		/*
   3792		 * base might have underflowed, add last_end before
   3793		 * comparing.
   3794		 */
   3795		if (base + last_end < vmalloc_start + last_end)
   3796			goto overflow;
   3797
   3798		/*
   3799		 * Fitting base has not been found.
   3800		 */
   3801		if (va == NULL)
   3802			goto overflow;
   3803
   3804		/*
   3805		 * If required width exceeds current VA block, move
   3806		 * base downwards and then recheck.
   3807		 */
   3808		if (base + end > va->va_end) {
   3809			base = pvm_determine_end_from_reverse(&va, align) - end;
   3810			term_area = area;
   3811			continue;
   3812		}
   3813
   3814		/*
   3815		 * If this VA does not fit, move base downwards and recheck.
   3816		 */
   3817		if (base + start < va->va_start) {
   3818			va = node_to_va(rb_prev(&va->rb_node));
   3819			base = pvm_determine_end_from_reverse(&va, align) - end;
   3820			term_area = area;
   3821			continue;
   3822		}
   3823
   3824		/*
   3825		 * This area fits, move on to the previous one.  If
   3826		 * the previous one is the terminal one, we're done.
   3827		 */
   3828		area = (area + nr_vms - 1) % nr_vms;
   3829		if (area == term_area)
   3830			break;
   3831
   3832		start = offsets[area];
   3833		end = start + sizes[area];
   3834		va = pvm_find_va_enclose_addr(base + end);
   3835	}
   3836
   3837	/* we've found a fitting base, insert all va's */
   3838	for (area = 0; area < nr_vms; area++) {
   3839		int ret;
   3840
   3841		start = base + offsets[area];
   3842		size = sizes[area];
   3843
   3844		va = pvm_find_va_enclose_addr(start);
   3845		if (WARN_ON_ONCE(va == NULL))
   3846			/* It is a BUG(), but trigger recovery instead. */
   3847			goto recovery;
   3848
   3849		type = classify_va_fit_type(va, start, size);
   3850		if (WARN_ON_ONCE(type == NOTHING_FIT))
   3851			/* It is a BUG(), but trigger recovery instead. */
   3852			goto recovery;
   3853
   3854		ret = adjust_va_to_fit_type(va, start, size, type);
   3855		if (unlikely(ret))
   3856			goto recovery;
   3857
   3858		/* Allocated area. */
   3859		va = vas[area];
   3860		va->va_start = start;
   3861		va->va_end = start + size;
   3862	}
   3863
   3864	spin_unlock(&free_vmap_area_lock);
   3865
   3866	/* populate the kasan shadow space */
   3867	for (area = 0; area < nr_vms; area++) {
   3868		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
   3869			goto err_free_shadow;
   3870	}
   3871
   3872	/* insert all vm's */
   3873	spin_lock(&vmap_area_lock);
   3874	for (area = 0; area < nr_vms; area++) {
   3875		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
   3876
   3877		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
   3878				 pcpu_get_vm_areas);
   3879	}
   3880	spin_unlock(&vmap_area_lock);
   3881
   3882	/*
   3883	 * Mark allocated areas as accessible. Do it now as a best-effort
   3884	 * approach, as they can be mapped outside of vmalloc code.
   3885	 * With hardware tag-based KASAN, marking is skipped for
   3886	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
   3887	 */
   3888	for (area = 0; area < nr_vms; area++)
   3889		vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
   3890				vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
   3891
   3892	kfree(vas);
   3893	return vms;
   3894
   3895recovery:
    3896	/*
    3897	 * Remove the previously allocated areas. There is no
    3898	 * need to remove these areas from the busy tree,
    3899	 * because they are inserted only on the final step
    3900	 * and only when pcpu_get_vm_areas() succeeds.
    3901	 */
   3902	while (area--) {
   3903		orig_start = vas[area]->va_start;
   3904		orig_end = vas[area]->va_end;
   3905		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
   3906				&free_vmap_area_list);
   3907		if (va)
   3908			kasan_release_vmalloc(orig_start, orig_end,
   3909				va->va_start, va->va_end);
   3910		vas[area] = NULL;
   3911	}
   3912
   3913overflow:
   3914	spin_unlock(&free_vmap_area_lock);
   3915	if (!purged) {
   3916		purge_vmap_area_lazy();
   3917		purged = true;
   3918
   3919		/* Before "retry", check if we recover. */
   3920		for (area = 0; area < nr_vms; area++) {
   3921			if (vas[area])
   3922				continue;
   3923
   3924			vas[area] = kmem_cache_zalloc(
   3925				vmap_area_cachep, GFP_KERNEL);
   3926			if (!vas[area])
   3927				goto err_free;
   3928		}
   3929
   3930		goto retry;
   3931	}
   3932
   3933err_free:
   3934	for (area = 0; area < nr_vms; area++) {
   3935		if (vas[area])
   3936			kmem_cache_free(vmap_area_cachep, vas[area]);
   3937
   3938		kfree(vms[area]);
   3939	}
   3940err_free2:
   3941	kfree(vas);
   3942	kfree(vms);
   3943	return NULL;
   3944
   3945err_free_shadow:
   3946	spin_lock(&free_vmap_area_lock);
   3947	/*
   3948	 * We release all the vmalloc shadows, even the ones for regions that
   3949	 * hadn't been successfully added. This relies on kasan_release_vmalloc
   3950	 * being able to tolerate this case.
   3951	 */
   3952	for (area = 0; area < nr_vms; area++) {
   3953		orig_start = vas[area]->va_start;
   3954		orig_end = vas[area]->va_end;
   3955		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
   3956				&free_vmap_area_list);
   3957		if (va)
   3958			kasan_release_vmalloc(orig_start, orig_end,
   3959				va->va_start, va->va_end);
   3960		vas[area] = NULL;
   3961		kfree(vms[area]);
   3962	}
   3963	spin_unlock(&free_vmap_area_lock);
   3964	kfree(vas);
   3965	kfree(vms);
   3966	return NULL;
   3967}
   3968
   3969/**
   3970 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
   3971 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
   3972 * @nr_vms: the number of allocated areas
   3973 *
   3974 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
   3975 */
   3976void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
   3977{
   3978	int i;
   3979
   3980	for (i = 0; i < nr_vms; i++)
   3981		free_vm_area(vms[i]);
   3982	kfree(vms);
   3983}
   3984#endif	/* CONFIG_SMP */
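
/*
 * Illustrative sketch (hypothetical values, not from this file): the
 * main user of pcpu_get_vm_areas()/pcpu_free_vm_areas() is the percpu
 * allocator (mm/percpu.c), which requests one congruent area per percpu
 * group. The offsets/sizes below merely stand in for its group layout.
 */
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/vmalloc.h>

static struct vm_struct **demo_grab_percpu_areas(void)
{
	static const unsigned long offsets[] = { 0, SZ_2M };
	static const size_t sizes[] = { SZ_1M, SZ_1M };
	struct vm_struct **vms;

	/* Offsets and sizes must both be aligned to the requested alignment. */
	vms = pcpu_get_vm_areas(offsets, sizes, ARRAY_SIZE(sizes), SZ_1M);
	if (!vms)
		return NULL;

	/* ... map pages into vms[i]->addr; eventually undo with: */
	/* pcpu_free_vm_areas(vms, ARRAY_SIZE(sizes)); */
	return vms;
}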
   3985
   3986#ifdef CONFIG_PRINTK
   3987bool vmalloc_dump_obj(void *object)
   3988{
   3989	struct vm_struct *vm;
   3990	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
   3991
   3992	vm = find_vm_area(objp);
   3993	if (!vm)
   3994		return false;
   3995	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
   3996		vm->nr_pages, (unsigned long)vm->addr, vm->caller);
   3997	return true;
   3998}
   3999#endif
   4000
   4001#ifdef CONFIG_PROC_FS
   4002static void *s_start(struct seq_file *m, loff_t *pos)
   4003	__acquires(&vmap_purge_lock)
   4004	__acquires(&vmap_area_lock)
   4005{
   4006	mutex_lock(&vmap_purge_lock);
   4007	spin_lock(&vmap_area_lock);
   4008
   4009	return seq_list_start(&vmap_area_list, *pos);
   4010}
   4011
   4012static void *s_next(struct seq_file *m, void *p, loff_t *pos)
   4013{
   4014	return seq_list_next(p, &vmap_area_list, pos);
   4015}
   4016
   4017static void s_stop(struct seq_file *m, void *p)
   4018	__releases(&vmap_area_lock)
   4019	__releases(&vmap_purge_lock)
   4020{
   4021	spin_unlock(&vmap_area_lock);
   4022	mutex_unlock(&vmap_purge_lock);
   4023}
   4024
   4025static void show_numa_info(struct seq_file *m, struct vm_struct *v)
   4026{
   4027	if (IS_ENABLED(CONFIG_NUMA)) {
   4028		unsigned int nr, *counters = m->private;
   4029		unsigned int step = 1U << vm_area_page_order(v);
   4030
   4031		if (!counters)
   4032			return;
   4033
   4034		if (v->flags & VM_UNINITIALIZED)
   4035			return;
   4036		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
   4037		smp_rmb();
   4038
   4039		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
   4040
   4041		for (nr = 0; nr < v->nr_pages; nr += step)
   4042			counters[page_to_nid(v->pages[nr])] += step;
   4043		for_each_node_state(nr, N_HIGH_MEMORY)
   4044			if (counters[nr])
   4045				seq_printf(m, " N%u=%u", nr, counters[nr]);
   4046	}
   4047}
   4048
   4049static void show_purge_info(struct seq_file *m)
   4050{
   4051	struct vmap_area *va;
   4052
   4053	spin_lock(&purge_vmap_area_lock);
   4054	list_for_each_entry(va, &purge_vmap_area_list, list) {
   4055		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
   4056			(void *)va->va_start, (void *)va->va_end,
   4057			va->va_end - va->va_start);
   4058	}
   4059	spin_unlock(&purge_vmap_area_lock);
   4060}
   4061
   4062static int s_show(struct seq_file *m, void *p)
   4063{
   4064	struct vmap_area *va;
   4065	struct vm_struct *v;
   4066
   4067	va = list_entry(p, struct vmap_area, list);
   4068
    4069	/*
    4070	 * s_show can race with remove_vm_area(): a NULL ->vm means the
    4071	 * vmap area is being torn down or is a vm_map_ram allocation.
    4072	 */
   4073	if (!va->vm) {
   4074		seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
   4075			(void *)va->va_start, (void *)va->va_end,
   4076			va->va_end - va->va_start);
   4077
   4078		goto final;
   4079	}
   4080
   4081	v = va->vm;
   4082
   4083	seq_printf(m, "0x%pK-0x%pK %7ld",
   4084		v->addr, v->addr + v->size, v->size);
   4085
   4086	if (v->caller)
   4087		seq_printf(m, " %pS", v->caller);
   4088
   4089	if (v->nr_pages)
   4090		seq_printf(m, " pages=%d", v->nr_pages);
   4091
   4092	if (v->phys_addr)
   4093		seq_printf(m, " phys=%pa", &v->phys_addr);
   4094
   4095	if (v->flags & VM_IOREMAP)
   4096		seq_puts(m, " ioremap");
   4097
   4098	if (v->flags & VM_ALLOC)
   4099		seq_puts(m, " vmalloc");
   4100
   4101	if (v->flags & VM_MAP)
   4102		seq_puts(m, " vmap");
   4103
   4104	if (v->flags & VM_USERMAP)
   4105		seq_puts(m, " user");
   4106
   4107	if (v->flags & VM_DMA_COHERENT)
   4108		seq_puts(m, " dma-coherent");
   4109
   4110	if (is_vmalloc_addr(v->pages))
   4111		seq_puts(m, " vpages");
   4112
   4113	show_numa_info(m, v);
   4114	seq_putc(m, '\n');
   4115
   4116	/*
   4117	 * As a final step, dump "unpurged" areas.
   4118	 */
   4119final:
   4120	if (list_is_last(&va->list, &vmap_area_list))
   4121		show_purge_info(m);
   4122
   4123	return 0;
   4124}
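
/*
 * For reference, s_show() above produces /proc/vmallocinfo lines roughly
 * of the following shape (addresses are hashed by %pK; all values here
 * are illustrative only):
 *
 *   0x00000000deadbeef-0x00000000feedface   20480 load_module+0x.../0x... pages=4 vmalloc N0=4
 *   0x00000000cafebabe-0x00000000baddcafe    8192 unpurged vm_area
 */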
   4125
   4126static const struct seq_operations vmalloc_op = {
   4127	.start = s_start,
   4128	.next = s_next,
   4129	.stop = s_stop,
   4130	.show = s_show,
   4131};
   4132
   4133static int __init proc_vmalloc_init(void)
   4134{
   4135	if (IS_ENABLED(CONFIG_NUMA))
   4136		proc_create_seq_private("vmallocinfo", 0400, NULL,
   4137				&vmalloc_op,
   4138				nr_node_ids * sizeof(unsigned int), NULL);
   4139	else
   4140		proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
   4141	return 0;
   4142}
   4143module_init(proc_vmalloc_init);
   4144
   4145#endif