cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mempolicy.c (79697B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Simple NUMA memory policy for the Linux kernel.
      4 *
      5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
      6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
      7 *
      8 * NUMA policy allows the user to give hints in which node(s) memory should
      9 * be allocated.
     10 *
     11 * Support four policies per VMA and per process:
     12 *
     13 * The VMA policy has priority over the process policy for a page fault.
     14 *
     15 * interleave     Allocate memory interleaved over a set of nodes,
     16 *                with normal fallback if it fails.
     17 *                For VMA based allocations this interleaves based on the
     18 *                offset into the backing object or offset into the mapping
     19 *                for anonymous memory. For process policy a process counter
     20 *                is used.
     21 *
     22 * bind           Only allocate memory on a specific set of nodes,
     23 *                no fallback.
     24 *                FIXME: memory is allocated starting with the first node
     25 *                to the last. It would be better if bind would truly restrict
     26 *                the allocation to memory nodes instead.
     27 *
     28 * preferred      Try a specific node first before normal fallback.
     29 *                As a special case NUMA_NO_NODE here means do the allocation
     30 *                on the local CPU. This is normally identical to default,
     31 *                but useful to set in a VMA when you have a non default
     32 *                process policy.
     33 *
     34 * preferred many Try a set of nodes first before normal fallback. This is
     35 *                similar to preferred without the special case.
     36 *
     37 * default        Allocate on the local node first, or when on a VMA
     38 *                use the process policy. This is what Linux always did
     39 *                in a NUMA aware kernel and still does by, ahem, default.
     40 *
     41 * The process policy is applied for most non interrupt memory allocations
     42 * in that process' context. Interrupts ignore the policies and always
     43 * try to allocate on the local CPU. The VMA policy is only applied for memory
     44 * allocations for a VMA in the VM.
     45 *
     46 * Currently there are a few corner cases in swapping where the policy
     47 * is not applied, but the majority should be handled. When process policy
     48 * is used it is not remembered over swap outs/swap ins.
     49 *
     50 * Only the highest zone in the zone hierarchy gets policied. Allocations
     51 * requesting a lower zone just use default policy. This implies that
     52 * on systems with highmem, kernel lowmem allocations don't get policied.
     53 * Same with GFP_DMA allocations.
     54 *
     55 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
     56 * all users and remembered even when nobody has memory mapped.
     57 */
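
As a rough illustration of the policy modes described above, here is a hedged userspace sketch (editor's addition, not part of mempolicy.c) that combines a process-wide interleave policy with a per-VMA bind policy via set_mempolicy(2) and mbind(2). It assumes a system with at least two memory nodes and the numaif.h header shipped with libnuma; buffer sizes are illustrative.

#include <numaif.h>	/* set_mempolicy(), mbind(), MPOL_* (libnuma) */
#include <sys/mman.h>
#include <string.h>

int main(void)
{
	unsigned long both = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */
	unsigned long node0 = 1UL << 0;
	size_t len = 64UL << 20;
	void *buf;

	/* Process policy: interleave new allocations across nodes 0 and 1. */
	if (set_mempolicy(MPOL_INTERLEAVE, &both, 8 * sizeof(both)))
		return 1;

	/*
	 * VMA policy: bind this mapping to node 0 only; as noted above it
	 * takes priority over the process policy for faults in this range.
	 */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	if (mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0))
		return 1;

	memset(buf, 0, len);	/* pages fault in under the VMA policy */
	return 0;
}
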
     58
     59/* Notebook:
     60   fix mmap readahead to honour policy and enable policy for any page cache
     61   object
     62   statistics for bigpages
     63   global policy for page cache? currently it uses process policy. Requires
     64   first item above.
     65   handle mremap for shared memory (currently ignored for the policy)
     66   grows down?
     67   make bind policy root only? It can trigger oom much faster and the
     68   kernel is not always graceful about that.
     69*/
     70
     71#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     72
     73#include <linux/mempolicy.h>
     74#include <linux/pagewalk.h>
     75#include <linux/highmem.h>
     76#include <linux/hugetlb.h>
     77#include <linux/kernel.h>
     78#include <linux/sched.h>
     79#include <linux/sched/mm.h>
     80#include <linux/sched/numa_balancing.h>
     81#include <linux/sched/task.h>
     82#include <linux/nodemask.h>
     83#include <linux/cpuset.h>
     84#include <linux/slab.h>
     85#include <linux/string.h>
     86#include <linux/export.h>
     87#include <linux/nsproxy.h>
     88#include <linux/interrupt.h>
     89#include <linux/init.h>
     90#include <linux/compat.h>
     91#include <linux/ptrace.h>
     92#include <linux/swap.h>
     93#include <linux/seq_file.h>
     94#include <linux/proc_fs.h>
     95#include <linux/migrate.h>
     96#include <linux/ksm.h>
     97#include <linux/rmap.h>
     98#include <linux/security.h>
     99#include <linux/syscalls.h>
    100#include <linux/ctype.h>
    101#include <linux/mm_inline.h>
    102#include <linux/mmu_notifier.h>
    103#include <linux/printk.h>
    104#include <linux/swapops.h>
    105
    106#include <asm/tlbflush.h>
    107#include <asm/tlb.h>
    108#include <linux/uaccess.h>
    109
    110#include "internal.h"
    111
    112/* Internal flags */
    113#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
    114#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
    115
    116static struct kmem_cache *policy_cache;
    117static struct kmem_cache *sn_cache;
    118
    119/* Highest zone. A specific allocation for a zone below that is not
    120   policied. */
    121enum zone_type policy_zone = 0;
    122
    123/*
    124 * run-time system-wide default policy => local allocation
    125 */
    126static struct mempolicy default_policy = {
    127	.refcnt = ATOMIC_INIT(1), /* never free it */
    128	.mode = MPOL_LOCAL,
    129};
    130
    131static struct mempolicy preferred_node_policy[MAX_NUMNODES];
    132
    133/**
    134 * numa_map_to_online_node - Find closest online node
    135 * @node: Node id to start the search
    136 *
    137 * Look up the next closest node by distance if @node is not online.
    138 *
    139 * Return: this @node if it is online, otherwise the closest node by distance
    140 */
    141int numa_map_to_online_node(int node)
    142{
    143	int min_dist = INT_MAX, dist, n, min_node;
    144
    145	if (node == NUMA_NO_NODE || node_online(node))
    146		return node;
    147
    148	min_node = node;
    149	for_each_online_node(n) {
    150		dist = node_distance(node, n);
    151		if (dist < min_dist) {
    152			min_dist = dist;
    153			min_node = n;
    154		}
    155	}
    156
    157	return min_node;
    158}
    159EXPORT_SYMBOL_GPL(numa_map_to_online_node);
    160
    161struct mempolicy *get_task_policy(struct task_struct *p)
    162{
    163	struct mempolicy *pol = p->mempolicy;
    164	int node;
    165
    166	if (pol)
    167		return pol;
    168
    169	node = numa_node_id();
    170	if (node != NUMA_NO_NODE) {
    171		pol = &preferred_node_policy[node];
    172		/* preferred_node_policy is not initialised early in boot */
    173		if (pol->mode)
    174			return pol;
    175	}
    176
    177	return &default_policy;
    178}
    179
    180static const struct mempolicy_operations {
    181	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
    182	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
    183} mpol_ops[MPOL_MAX];
    184
    185static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
    186{
    187	return pol->flags & MPOL_MODE_FLAGS;
    188}
    189
    190static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
    191				   const nodemask_t *rel)
    192{
    193	nodemask_t tmp;
    194	nodes_fold(tmp, *orig, nodes_weight(*rel));
    195	nodes_onto(*ret, tmp, *rel);
    196}
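
/*
 * Worked example (editor's sketch of the fold/onto behaviour above, based on
 * the bitmap_fold()/bitmap_onto() semantics): with MPOL_F_RELATIVE_NODES, a
 * user mask of {0,2} relative to an allowed set of {4,5,6} (weight 3) is
 * first folded modulo 3, giving {0,2}, and then mapped onto the allowed set,
 * so relative bit n selects the n-th set bit of *rel: the result is {4,6}.
 * A relative bit past the weight wraps around: {3} folds to {0} and maps to {4}.
 */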
    197
    198static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
    199{
    200	if (nodes_empty(*nodes))
    201		return -EINVAL;
    202	pol->nodes = *nodes;
    203	return 0;
    204}
    205
    206static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
    207{
    208	if (nodes_empty(*nodes))
    209		return -EINVAL;
    210
    211	nodes_clear(pol->nodes);
    212	node_set(first_node(*nodes), pol->nodes);
    213	return 0;
    214}
    215
    216/*
    217 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
    218 * any, for the new policy.  mpol_new() has already validated the nodes
    219 * parameter with respect to the policy mode and flags.
    220 *
    221 * Must be called holding task's alloc_lock to protect task's mems_allowed
    222 * and mempolicy.  May also be called holding the mmap_lock for write.
    223 */
    224static int mpol_set_nodemask(struct mempolicy *pol,
    225		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
    226{
    227	int ret;
    228
    229	/*
    230	 * Default (pol==NULL) and local memory policies are not
    231	 * subject to any remapping. They also do not need any special
    232	 * constructor.
    233	 */
    234	if (!pol || pol->mode == MPOL_LOCAL)
    235		return 0;
    236
    237	/* Check N_MEMORY */
    238	nodes_and(nsc->mask1,
    239		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
    240
    241	VM_BUG_ON(!nodes);
    242
    243	if (pol->flags & MPOL_F_RELATIVE_NODES)
    244		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
    245	else
    246		nodes_and(nsc->mask2, *nodes, nsc->mask1);
    247
    248	if (mpol_store_user_nodemask(pol))
    249		pol->w.user_nodemask = *nodes;
    250	else
    251		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
    252
    253	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
    254	return ret;
    255}
    256
    257/*
    258 * This function just creates a new policy, does some checks and simple
    259 * initialization. You must invoke mpol_set_nodemask() to set nodes.
    260 */
    261static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
    262				  nodemask_t *nodes)
    263{
    264	struct mempolicy *policy;
    265
    266	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
    267		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
    268
    269	if (mode == MPOL_DEFAULT) {
    270		if (nodes && !nodes_empty(*nodes))
    271			return ERR_PTR(-EINVAL);
    272		return NULL;
    273	}
    274	VM_BUG_ON(!nodes);
    275
    276	/*
    277	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
    278	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
    279	 * All other modes require a valid pointer to a non-empty nodemask.
    280	 */
    281	if (mode == MPOL_PREFERRED) {
    282		if (nodes_empty(*nodes)) {
    283			if (((flags & MPOL_F_STATIC_NODES) ||
    284			     (flags & MPOL_F_RELATIVE_NODES)))
    285				return ERR_PTR(-EINVAL);
    286
    287			mode = MPOL_LOCAL;
    288		}
    289	} else if (mode == MPOL_LOCAL) {
    290		if (!nodes_empty(*nodes) ||
    291		    (flags & MPOL_F_STATIC_NODES) ||
    292		    (flags & MPOL_F_RELATIVE_NODES))
    293			return ERR_PTR(-EINVAL);
    294	} else if (nodes_empty(*nodes))
    295		return ERR_PTR(-EINVAL);
    296	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
    297	if (!policy)
    298		return ERR_PTR(-ENOMEM);
    299	atomic_set(&policy->refcnt, 1);
    300	policy->mode = mode;
    301	policy->flags = flags;
    302	policy->home_node = NUMA_NO_NODE;
    303
    304	return policy;
    305}
    306
    307/* Slow path of a mpol destructor. */
    308void __mpol_put(struct mempolicy *p)
    309{
    310	if (!atomic_dec_and_test(&p->refcnt))
    311		return;
    312	kmem_cache_free(policy_cache, p);
    313}
    314
    315static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
    316{
    317}
    318
    319static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
    320{
    321	nodemask_t tmp;
    322
    323	if (pol->flags & MPOL_F_STATIC_NODES)
    324		nodes_and(tmp, pol->w.user_nodemask, *nodes);
    325	else if (pol->flags & MPOL_F_RELATIVE_NODES)
    326		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
    327	else {
    328		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
    329								*nodes);
    330		pol->w.cpuset_mems_allowed = *nodes;
    331	}
    332
    333	if (nodes_empty(tmp))
    334		tmp = *nodes;
    335
    336	pol->nodes = tmp;
    337}
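
/*
 * Worked example (editor's sketch, not from the original source): suppose a
 * policy was created over nodes {1,2} while the cpuset allowed {0-3}, and the
 * cpuset is then rebound to {4-7}.  With MPOL_F_STATIC_NODES the user mask is
 * simply intersected with the new set ({1,2} & {4-7} is empty, so the fallback
 * above picks the whole new set).  With MPOL_F_RELATIVE_NODES the user mask is
 * re-applied relative to the new set, giving {5,6}.  With neither flag the
 * nodes are remapped by position: node 1 -> 5 and node 2 -> 6.
 */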
    338
    339static void mpol_rebind_preferred(struct mempolicy *pol,
    340						const nodemask_t *nodes)
    341{
    342	pol->w.cpuset_mems_allowed = *nodes;
    343}
    344
    345/*
    346 * mpol_rebind_policy - Migrate a policy to a different set of nodes
    347 *
    348 * Per-vma policies are protected by mmap_lock. Allocations using per-task
    349 * policies are protected by task->mems_allowed_seq to prevent a premature
    350 * OOM/allocation failure due to parallel nodemask modification.
    351 */
    352static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
    353{
    354	if (!pol || pol->mode == MPOL_LOCAL)
    355		return;
    356	if (!mpol_store_user_nodemask(pol) &&
    357	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
    358		return;
    359
    360	mpol_ops[pol->mode].rebind(pol, newmask);
    361}
    362
    363/*
    364 * Wrapper for mpol_rebind_policy() that just requires task
    365 * pointer, and updates task mempolicy.
    366 *
    367 * Called with task's alloc_lock held.
    368 */
    369
    370void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
    371{
    372	mpol_rebind_policy(tsk->mempolicy, new);
    373}
    374
    375/*
    376 * Rebind each vma in mm to new nodemask.
    377 *
    378 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
    379 */
    380
    381void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
    382{
    383	struct vm_area_struct *vma;
    384
    385	mmap_write_lock(mm);
    386	for (vma = mm->mmap; vma; vma = vma->vm_next)
    387		mpol_rebind_policy(vma->vm_policy, new);
    388	mmap_write_unlock(mm);
    389}
    390
    391static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
    392	[MPOL_DEFAULT] = {
    393		.rebind = mpol_rebind_default,
    394	},
    395	[MPOL_INTERLEAVE] = {
    396		.create = mpol_new_nodemask,
    397		.rebind = mpol_rebind_nodemask,
    398	},
    399	[MPOL_PREFERRED] = {
    400		.create = mpol_new_preferred,
    401		.rebind = mpol_rebind_preferred,
    402	},
    403	[MPOL_BIND] = {
    404		.create = mpol_new_nodemask,
    405		.rebind = mpol_rebind_nodemask,
    406	},
    407	[MPOL_LOCAL] = {
    408		.rebind = mpol_rebind_default,
    409	},
    410	[MPOL_PREFERRED_MANY] = {
    411		.create = mpol_new_nodemask,
    412		.rebind = mpol_rebind_preferred,
    413	},
    414};
    415
    416static int migrate_page_add(struct page *page, struct list_head *pagelist,
    417				unsigned long flags);
    418
    419struct queue_pages {
    420	struct list_head *pagelist;
    421	unsigned long flags;
    422	nodemask_t *nmask;
    423	unsigned long start;
    424	unsigned long end;
    425	struct vm_area_struct *first;
    426};
    427
    428/*
    429 * Check if the page's nid is in qp->nmask.
    430 *
    431 * If MPOL_MF_INVERT is set in qp->flags, the check is inverted:
    432 * the page qualifies if its nid is not in qp->nmask.
    433 */
    434static inline bool queue_pages_required(struct page *page,
    435					struct queue_pages *qp)
    436{
    437	int nid = page_to_nid(page);
    438	unsigned long flags = qp->flags;
    439
    440	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
    441}
    442
    443/*
    444 * queue_pages_pmd() has three possible return values:
    445 * 0 - pages are placed on the right node or queued successfully, or
    446 *     special page is met, i.e. huge zero page.
    447 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
    448 *     specified.
    449 * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified and an
    450 *        existing page was already on a node that does not follow the
    451 *        policy.
    452 */
    453static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
    454				unsigned long end, struct mm_walk *walk)
    455	__releases(ptl)
    456{
    457	int ret = 0;
    458	struct page *page;
    459	struct queue_pages *qp = walk->private;
    460	unsigned long flags;
    461
    462	if (unlikely(is_pmd_migration_entry(*pmd))) {
    463		ret = -EIO;
    464		goto unlock;
    465	}
    466	page = pmd_page(*pmd);
    467	if (is_huge_zero_page(page)) {
    468		spin_unlock(ptl);
    469		walk->action = ACTION_CONTINUE;
    470		goto out;
    471	}
    472	if (!queue_pages_required(page, qp))
    473		goto unlock;
    474
    475	flags = qp->flags;
    476	/* go to thp migration */
    477	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
    478		if (!vma_migratable(walk->vma) ||
    479		    migrate_page_add(page, qp->pagelist, flags)) {
    480			ret = 1;
    481			goto unlock;
    482		}
    483	} else
    484		ret = -EIO;
    485unlock:
    486	spin_unlock(ptl);
    487out:
    488	return ret;
    489}
    490
    491/*
    492 * Scan through pages checking if pages follow certain conditions,
    493 * and move them to the pagelist if they do.
    494 *
    495 * queue_pages_pte_range() has three possible return values:
    496 * 0 - pages are placed on the right node or queued successfully, or
    497 *     special page is met, i.e. zero page.
    498 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
    499 *     specified.
    500 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
    501 *        on a node that does not follow the policy.
    502 */
    503static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
    504			unsigned long end, struct mm_walk *walk)
    505{
    506	struct vm_area_struct *vma = walk->vma;
    507	struct page *page;
    508	struct queue_pages *qp = walk->private;
    509	unsigned long flags = qp->flags;
    510	bool has_unmovable = false;
    511	pte_t *pte, *mapped_pte;
    512	spinlock_t *ptl;
    513
    514	ptl = pmd_trans_huge_lock(pmd, vma);
    515	if (ptl)
    516		return queue_pages_pmd(pmd, ptl, addr, end, walk);
    517
    518	if (pmd_trans_unstable(pmd))
    519		return 0;
    520
    521	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
    522	for (; addr != end; pte++, addr += PAGE_SIZE) {
    523		if (!pte_present(*pte))
    524			continue;
    525		page = vm_normal_page(vma, addr, *pte);
    526		if (!page)
    527			continue;
    528		/*
    529		 * vm_normal_page() filters out zero pages, but there might
    530		 * still be PageReserved pages to skip, perhaps in a VDSO.
    531		 */
    532		if (PageReserved(page))
    533			continue;
    534		if (!queue_pages_required(page, qp))
    535			continue;
    536		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
    537			/* MPOL_MF_STRICT must be specified if we get here */
    538			if (!vma_migratable(vma)) {
    539				has_unmovable = true;
    540				break;
    541			}
    542
    543			/*
    544			 * Do not abort immediately since there may be
    545		 * temporarily off-LRU pages in the range.  Still
    546		 * need to migrate the other LRU pages.
    547			 */
    548			if (migrate_page_add(page, qp->pagelist, flags))
    549				has_unmovable = true;
    550		} else
    551			break;
    552	}
    553	pte_unmap_unlock(mapped_pte, ptl);
    554	cond_resched();
    555
    556	if (has_unmovable)
    557		return 1;
    558
    559	return addr != end ? -EIO : 0;
    560}
    561
    562static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
    563			       unsigned long addr, unsigned long end,
    564			       struct mm_walk *walk)
    565{
    566	int ret = 0;
    567#ifdef CONFIG_HUGETLB_PAGE
    568	struct queue_pages *qp = walk->private;
    569	unsigned long flags = (qp->flags & MPOL_MF_VALID);
    570	struct page *page;
    571	spinlock_t *ptl;
    572	pte_t entry;
    573
    574	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
    575	entry = huge_ptep_get(pte);
    576	if (!pte_present(entry))
    577		goto unlock;
    578	page = pte_page(entry);
    579	if (!queue_pages_required(page, qp))
    580		goto unlock;
    581
    582	if (flags == MPOL_MF_STRICT) {
    583		/*
    584		 * STRICT alone means only detecting misplaced pages and no
    585		 * need to further check other vmas.
    586		 */
    587		ret = -EIO;
    588		goto unlock;
    589	}
    590
    591	if (!vma_migratable(walk->vma)) {
    592		/*
    593		 * Must be STRICT with MOVE*, otherwise .test_walk() would have
    594		 * stopped walking the current vma.
    595		 * Detect the misplaced page but allow migrating pages which
    596		 * have been queued.
    597		 */
    598		ret = 1;
    599		goto unlock;
    600	}
    601
    602	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
    603	if (flags & (MPOL_MF_MOVE_ALL) ||
    604	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
    605		if (!isolate_huge_page(page, qp->pagelist) &&
    606			(flags & MPOL_MF_STRICT))
    607			/*
    608			 * Failed to isolate page but allow migrating pages
    609			 * which have been queued.
    610			 */
    611			ret = 1;
    612	}
    613unlock:
    614	spin_unlock(ptl);
    615#else
    616	BUG();
    617#endif
    618	return ret;
    619}
    620
    621#ifdef CONFIG_NUMA_BALANCING
    622/*
    623 * This is used to mark a range of virtual addresses to be inaccessible.
    624 * These are later cleared by a NUMA hinting fault. Depending on these
    625 * faults, pages may be migrated for better NUMA placement.
    626 *
    627 * This is assuming that NUMA faults are handled using PROT_NONE. If
    628 * an architecture makes a different choice, it will need further
    629 * changes to the core.
    630 */
    631unsigned long change_prot_numa(struct vm_area_struct *vma,
    632			unsigned long addr, unsigned long end)
    633{
    634	struct mmu_gather tlb;
    635	int nr_updated;
    636
    637	tlb_gather_mmu(&tlb, vma->vm_mm);
    638
    639	nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
    640				       MM_CP_PROT_NUMA);
    641	if (nr_updated)
    642		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
    643
    644	tlb_finish_mmu(&tlb);
    645
    646	return nr_updated;
    647}
    648#else
    649static unsigned long change_prot_numa(struct vm_area_struct *vma,
    650			unsigned long addr, unsigned long end)
    651{
    652	return 0;
    653}
    654#endif /* CONFIG_NUMA_BALANCING */
    655
    656static int queue_pages_test_walk(unsigned long start, unsigned long end,
    657				struct mm_walk *walk)
    658{
    659	struct vm_area_struct *vma = walk->vma;
    660	struct queue_pages *qp = walk->private;
    661	unsigned long endvma = vma->vm_end;
    662	unsigned long flags = qp->flags;
    663
    664	/* range check first */
    665	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
    666
    667	if (!qp->first) {
    668		qp->first = vma;
    669		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
    670			(qp->start < vma->vm_start))
    671			/* hole at head side of range */
    672			return -EFAULT;
    673	}
    674	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
    675		((vma->vm_end < qp->end) &&
    676		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
    677		/* hole at middle or tail of range */
    678		return -EFAULT;
    679
    680	/*
    681	 * Need to check MPOL_MF_STRICT to return -EIO if possible
    682	 * regardless of vma_migratable
    683	 */
    684	if (!vma_migratable(vma) &&
    685	    !(flags & MPOL_MF_STRICT))
    686		return 1;
    687
    688	if (endvma > end)
    689		endvma = end;
    690
    691	if (flags & MPOL_MF_LAZY) {
    692		/* Similar to task_numa_work, skip inaccessible VMAs */
    693		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
    694			!(vma->vm_flags & VM_MIXEDMAP))
    695			change_prot_numa(vma, start, endvma);
    696		return 1;
    697	}
    698
    699	/* queue pages from current vma */
    700	if (flags & MPOL_MF_VALID)
    701		return 0;
    702	return 1;
    703}
    704
    705static const struct mm_walk_ops queue_pages_walk_ops = {
    706	.hugetlb_entry		= queue_pages_hugetlb,
    707	.pmd_entry		= queue_pages_pte_range,
    708	.test_walk		= queue_pages_test_walk,
    709};
    710
    711/*
    712 * Walk through page tables and collect pages to be migrated.
    713 *
    714 * If pages found in a given range are on a set of nodes (determined by
    715 * @nodes and @flags), they are isolated and queued to the pagelist, which is
    716 * passed via @private.
    717 *
    718 * queue_pages_range() has three possible return values:
    719 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
    720 *     specified.
    721 * 0 - queue pages successfully or no misplaced page.
    722 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
    723 *         memory range specified by nodemask and maxnode points outside
    724 *         your accessible address space (-EFAULT)
    725 */
    726static int
    727queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
    728		nodemask_t *nodes, unsigned long flags,
    729		struct list_head *pagelist)
    730{
    731	int err;
    732	struct queue_pages qp = {
    733		.pagelist = pagelist,
    734		.flags = flags,
    735		.nmask = nodes,
    736		.start = start,
    737		.end = end,
    738		.first = NULL,
    739	};
    740
    741	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
    742
    743	if (!qp.first)
    744		/* whole range in hole */
    745		err = -EFAULT;
    746
    747	return err;
    748}
    749
    750/*
    751 * Apply policy to a single VMA
    752 * This must be called with the mmap_lock held for writing.
    753 */
    754static int vma_replace_policy(struct vm_area_struct *vma,
    755						struct mempolicy *pol)
    756{
    757	int err;
    758	struct mempolicy *old;
    759	struct mempolicy *new;
    760
    761	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
    762		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
    763		 vma->vm_ops, vma->vm_file,
    764		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
    765
    766	new = mpol_dup(pol);
    767	if (IS_ERR(new))
    768		return PTR_ERR(new);
    769
    770	if (vma->vm_ops && vma->vm_ops->set_policy) {
    771		err = vma->vm_ops->set_policy(vma, new);
    772		if (err)
    773			goto err_out;
    774	}
    775
    776	old = vma->vm_policy;
    777	vma->vm_policy = new; /* protected by mmap_lock */
    778	mpol_put(old);
    779
    780	return 0;
    781 err_out:
    782	mpol_put(new);
    783	return err;
    784}
    785
    786/* Step 2: apply policy to a range and do splits. */
    787static int mbind_range(struct mm_struct *mm, unsigned long start,
    788		       unsigned long end, struct mempolicy *new_pol)
    789{
    790	struct vm_area_struct *prev;
    791	struct vm_area_struct *vma;
    792	int err = 0;
    793	pgoff_t pgoff;
    794	unsigned long vmstart;
    795	unsigned long vmend;
    796
    797	vma = find_vma(mm, start);
    798	VM_BUG_ON(!vma);
    799
    800	prev = vma->vm_prev;
    801	if (start > vma->vm_start)
    802		prev = vma;
    803
    804	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
    805		vmstart = max(start, vma->vm_start);
    806		vmend   = min(end, vma->vm_end);
    807
    808		if (mpol_equal(vma_policy(vma), new_pol))
    809			continue;
    810
    811		pgoff = vma->vm_pgoff +
    812			((vmstart - vma->vm_start) >> PAGE_SHIFT);
    813		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
    814				 vma->anon_vma, vma->vm_file, pgoff,
    815				 new_pol, vma->vm_userfaultfd_ctx,
    816				 anon_vma_name(vma));
    817		if (prev) {
    818			vma = prev;
    819			goto replace;
    820		}
    821		if (vma->vm_start != vmstart) {
    822			err = split_vma(vma->vm_mm, vma, vmstart, 1);
    823			if (err)
    824				goto out;
    825		}
    826		if (vma->vm_end != vmend) {
    827			err = split_vma(vma->vm_mm, vma, vmend, 0);
    828			if (err)
    829				goto out;
    830		}
    831 replace:
    832		err = vma_replace_policy(vma, new_pol);
    833		if (err)
    834			goto out;
    835	}
    836
    837 out:
    838	return err;
    839}
    840
    841/* Set the process memory policy */
    842static long do_set_mempolicy(unsigned short mode, unsigned short flags,
    843			     nodemask_t *nodes)
    844{
    845	struct mempolicy *new, *old;
    846	NODEMASK_SCRATCH(scratch);
    847	int ret;
    848
    849	if (!scratch)
    850		return -ENOMEM;
    851
    852	new = mpol_new(mode, flags, nodes);
    853	if (IS_ERR(new)) {
    854		ret = PTR_ERR(new);
    855		goto out;
    856	}
    857
    858	ret = mpol_set_nodemask(new, nodes, scratch);
    859	if (ret) {
    860		mpol_put(new);
    861		goto out;
    862	}
    863	task_lock(current);
    864	old = current->mempolicy;
    865	current->mempolicy = new;
    866	if (new && new->mode == MPOL_INTERLEAVE)
    867		current->il_prev = MAX_NUMNODES-1;
    868	task_unlock(current);
    869	mpol_put(old);
    870	ret = 0;
    871out:
    872	NODEMASK_SCRATCH_FREE(scratch);
    873	return ret;
    874}
    875
    876/*
    877 * Return nodemask for policy for get_mempolicy() query
    878 *
    879 * Called with task's alloc_lock held
    880 */
    881static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
    882{
    883	nodes_clear(*nodes);
    884	if (p == &default_policy)
    885		return;
    886
    887	switch (p->mode) {
    888	case MPOL_BIND:
    889	case MPOL_INTERLEAVE:
    890	case MPOL_PREFERRED:
    891	case MPOL_PREFERRED_MANY:
    892		*nodes = p->nodes;
    893		break;
    894	case MPOL_LOCAL:
    895		/* return empty node mask for local allocation */
    896		break;
    897	default:
    898		BUG();
    899	}
    900}
    901
    902static int lookup_node(struct mm_struct *mm, unsigned long addr)
    903{
    904	struct page *p = NULL;
    905	int ret;
    906
    907	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
    908	if (ret > 0) {
    909		ret = page_to_nid(p);
    910		put_page(p);
    911	}
    912	return ret;
    913}
    914
    915/* Retrieve NUMA policy */
    916static long do_get_mempolicy(int *policy, nodemask_t *nmask,
    917			     unsigned long addr, unsigned long flags)
    918{
    919	int err;
    920	struct mm_struct *mm = current->mm;
    921	struct vm_area_struct *vma = NULL;
    922	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
    923
    924	if (flags &
    925		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
    926		return -EINVAL;
    927
    928	if (flags & MPOL_F_MEMS_ALLOWED) {
    929		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
    930			return -EINVAL;
    931		*policy = 0;	/* just so it's initialized */
    932		task_lock(current);
    933		*nmask  = cpuset_current_mems_allowed;
    934		task_unlock(current);
    935		return 0;
    936	}
    937
    938	if (flags & MPOL_F_ADDR) {
    939		/*
    940		 * Do NOT fall back to task policy if the
    941		 * vma/shared policy at addr is NULL.  We
    942		 * want to return MPOL_DEFAULT in this case.
    943		 */
    944		mmap_read_lock(mm);
    945		vma = vma_lookup(mm, addr);
    946		if (!vma) {
    947			mmap_read_unlock(mm);
    948			return -EFAULT;
    949		}
    950		if (vma->vm_ops && vma->vm_ops->get_policy)
    951			pol = vma->vm_ops->get_policy(vma, addr);
    952		else
    953			pol = vma->vm_policy;
    954	} else if (addr)
    955		return -EINVAL;
    956
    957	if (!pol)
    958		pol = &default_policy;	/* indicates default behavior */
    959
    960	if (flags & MPOL_F_NODE) {
    961		if (flags & MPOL_F_ADDR) {
    962			/*
    963			 * Take a refcount on the mpol, because we are about to
    964			 * drop the mmap_lock, after which only "pol" remains
    965			 * valid, "vma" is stale.
    966			 */
    967			pol_refcount = pol;
    968			vma = NULL;
    969			mpol_get(pol);
    970			mmap_read_unlock(mm);
    971			err = lookup_node(mm, addr);
    972			if (err < 0)
    973				goto out;
    974			*policy = err;
    975		} else if (pol == current->mempolicy &&
    976				pol->mode == MPOL_INTERLEAVE) {
    977			*policy = next_node_in(current->il_prev, pol->nodes);
    978		} else {
    979			err = -EINVAL;
    980			goto out;
    981		}
    982	} else {
    983		*policy = pol == &default_policy ? MPOL_DEFAULT :
    984						pol->mode;
    985		/*
    986		 * Internal mempolicy flags must be masked off before exposing
    987		 * the policy to userspace.
    988		 */
    989		*policy |= (pol->flags & MPOL_MODE_FLAGS);
    990	}
    991
    992	err = 0;
    993	if (nmask) {
    994		if (mpol_store_user_nodemask(pol)) {
    995			*nmask = pol->w.user_nodemask;
    996		} else {
    997			task_lock(current);
    998			get_policy_nodemask(pol, nmask);
    999			task_unlock(current);
   1000		}
   1001	}
   1002
   1003 out:
   1004	mpol_cond_put(pol);
   1005	if (vma)
   1006		mmap_read_unlock(mm);
   1007	if (pol_refcount)
   1008		mpol_put(pol_refcount);
   1009	return err;
   1010}
   1011
   1012#ifdef CONFIG_MIGRATION
   1013/*
   1014 * page migration, thp tail pages can be passed.
   1015 */
   1016static int migrate_page_add(struct page *page, struct list_head *pagelist,
   1017				unsigned long flags)
   1018{
   1019	struct page *head = compound_head(page);
   1020	/*
   1021	 * Avoid migrating a page that is shared with others.
   1022	 */
   1023	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
   1024		if (!isolate_lru_page(head)) {
   1025			list_add_tail(&head->lru, pagelist);
   1026			mod_node_page_state(page_pgdat(head),
   1027				NR_ISOLATED_ANON + page_is_file_lru(head),
   1028				thp_nr_pages(head));
   1029		} else if (flags & MPOL_MF_STRICT) {
   1030			/*
   1031			 * Non-movable page may reach here.  And, there may be
   1032			 * temporary off LRU pages or non-LRU movable pages.
   1033			 * Treat them as unmovable pages since they can't be
   1034			 * isolated, so they can't be moved at the moment.  It
   1035			 * should return -EIO for this case too.
   1036			 */
   1037			return -EIO;
   1038		}
   1039	}
   1040
   1041	return 0;
   1042}
   1043
   1044/*
   1045 * Migrate pages from one node to a target node.
   1046 * Returns error or the number of pages not migrated.
   1047 */
   1048static int migrate_to_node(struct mm_struct *mm, int source, int dest,
   1049			   int flags)
   1050{
   1051	nodemask_t nmask;
   1052	LIST_HEAD(pagelist);
   1053	int err = 0;
   1054	struct migration_target_control mtc = {
   1055		.nid = dest,
   1056		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
   1057	};
   1058
   1059	nodes_clear(nmask);
   1060	node_set(source, nmask);
   1061
   1062	/*
   1063	 * This does not "check" the range but isolates all pages that
   1064	 * need migration.  Between passing in the full user address
   1065	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
   1066	 */
   1067	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
   1068	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
   1069			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
   1070
   1071	if (!list_empty(&pagelist)) {
   1072		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
   1073				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
   1074		if (err)
   1075			putback_movable_pages(&pagelist);
   1076	}
   1077
   1078	return err;
   1079}
   1080
   1081/*
   1082 * Move pages between the two nodesets so as to preserve the physical
   1083 * layout as much as possible.
   1084 *
   1085 * Returns the number of pages that could not be moved.
   1086 */
   1087int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
   1088		     const nodemask_t *to, int flags)
   1089{
   1090	int busy = 0;
   1091	int err = 0;
   1092	nodemask_t tmp;
   1093
   1094	lru_cache_disable();
   1095
   1096	mmap_read_lock(mm);
   1097
   1098	/*
   1099	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
   1100	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
   1101	 * bit in 'tmp', and return that <source, dest> pair for migration.
   1102	 * The pair of nodemasks 'to' and 'from' define the map.
   1103	 *
   1104	 * If no pair of bits is found that way, fall back to picking some
   1105	 * pair of 'source' and 'dest' bits that are not the same.  If the
   1106	 * 'source' and 'dest' bits are the same, this represents a node
   1107	 * that will be migrating to itself, so no pages need move.
   1108	 *
   1109	 * If no bits are left in 'tmp', or if all remaining bits left
   1110	 * in 'tmp' correspond to the same bit in 'to', return false
   1111	 * (nothing left to migrate).
   1112	 *
   1113	 * This lets us pick a pair of nodes to migrate between, such that
   1114	 * if possible the dest node is not already occupied by some other
   1115	 * source node, minimizing the risk of overloading the memory on a
   1116	 * node that would happen if we migrated incoming memory to a node
   1117	 * before migrating outgoing memory sourced from that same node.
   1118	 *
   1119	 * A single scan of tmp is sufficient.  As we go, we remember the
   1120	 * most recent <s, d> pair that moved (s != d).  If we find a pair
   1121	 * that not only moved, but what's better, moved to an empty slot
   1122	 * (d is not set in tmp), then we break out then, with that pair.
   1123	 * Otherwise when we finish scanning tmp, we at least have the
   1124	 * most recent <s, d> pair that moved.  If we get all the way through
   1125	 * the scan of tmp without finding any node that moved, much less
   1126	 * moved to an empty node, then there is nothing left worth migrating.
   1127	 */
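
	/*
	 * Worked trace (editor's sketch): from = {0,1}, to = {1,2}.  The first
	 * scan of tmp = {0,1} remembers 0 -> 1 but keeps going because dest 1
	 * is still a source; it then finds 1 -> 2, whose dest is empty, and
	 * breaks.  So node 1 is drained into node 2 first, node 1 is cleared
	 * from tmp, and the second scan moves node 0 into the now-vacated
	 * node 1.
	 */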
   1128
   1129	tmp = *from;
   1130	while (!nodes_empty(tmp)) {
   1131		int s, d;
   1132		int source = NUMA_NO_NODE;
   1133		int dest = 0;
   1134
   1135		for_each_node_mask(s, tmp) {
   1136
   1137			/*
   1138			 * do_migrate_pages() tries to maintain the relative
   1139			 * node relationship of the pages established between
   1140			 * threads and memory areas.
   1141			 *
   1142			 * However, if the number of source nodes is not equal to
   1143			 * the number of destination nodes, we cannot preserve
   1144			 * this node-relative relationship.  In that case, skip
   1145			 * copying memory from a node that is in the destination
   1146			 * mask.
   1147			 *
   1148			 * Example: [2,3,4] -> [3,4,5] moves everything.
   1149			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
   1150			 */
   1151
   1152			if ((nodes_weight(*from) != nodes_weight(*to)) &&
   1153						(node_isset(s, *to)))
   1154				continue;
   1155
   1156			d = node_remap(s, *from, *to);
   1157			if (s == d)
   1158				continue;
   1159
   1160			source = s;	/* Node moved. Memorize */
   1161			dest = d;
   1162
   1163			/* dest not in remaining from nodes? */
   1164			if (!node_isset(dest, tmp))
   1165				break;
   1166		}
   1167		if (source == NUMA_NO_NODE)
   1168			break;
   1169
   1170		node_clear(source, tmp);
   1171		err = migrate_to_node(mm, source, dest, flags);
   1172		if (err > 0)
   1173			busy += err;
   1174		if (err < 0)
   1175			break;
   1176	}
   1177	mmap_read_unlock(mm);
   1178
   1179	lru_cache_enable();
   1180	if (err < 0)
   1181		return err;
   1182	return busy;
   1183
   1184}
   1185
   1186/*
   1187 * Allocate a new page for page migration based on vma policy.
   1188 * Start by assuming the page is mapped by the same vma that contains @start.
   1189 * Search forward from there, if not.  N.B., this assumes that the
   1190 * list of pages handed to migrate_pages()--which is how we get here--
   1191 * is in virtual address order.
   1192 */
   1193static struct page *new_page(struct page *page, unsigned long start)
   1194{
   1195	struct folio *dst, *src = page_folio(page);
   1196	struct vm_area_struct *vma;
   1197	unsigned long address;
   1198	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
   1199
   1200	vma = find_vma(current->mm, start);
   1201	while (vma) {
   1202		address = page_address_in_vma(page, vma);
   1203		if (address != -EFAULT)
   1204			break;
   1205		vma = vma->vm_next;
   1206	}
   1207
   1208	if (folio_test_hugetlb(src))
   1209		return alloc_huge_page_vma(page_hstate(&src->page),
   1210				vma, address);
   1211
   1212	if (folio_test_large(src))
   1213		gfp = GFP_TRANSHUGE;
   1214
   1215	/*
   1216	 * if !vma, vma_alloc_folio() will use task or system default policy
   1217	 */
   1218	dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
   1219			folio_test_large(src));
   1220	return &dst->page;
   1221}
   1222#else
   1223
   1224static int migrate_page_add(struct page *page, struct list_head *pagelist,
   1225				unsigned long flags)
   1226{
   1227	return -EIO;
   1228}
   1229
   1230int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
   1231		     const nodemask_t *to, int flags)
   1232{
   1233	return -ENOSYS;
   1234}
   1235
   1236static struct page *new_page(struct page *page, unsigned long start)
   1237{
   1238	return NULL;
   1239}
   1240#endif
   1241
   1242static long do_mbind(unsigned long start, unsigned long len,
   1243		     unsigned short mode, unsigned short mode_flags,
   1244		     nodemask_t *nmask, unsigned long flags)
   1245{
   1246	struct mm_struct *mm = current->mm;
   1247	struct mempolicy *new;
   1248	unsigned long end;
   1249	int err;
   1250	int ret;
   1251	LIST_HEAD(pagelist);
   1252
   1253	if (flags & ~(unsigned long)MPOL_MF_VALID)
   1254		return -EINVAL;
   1255	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
   1256		return -EPERM;
   1257
   1258	if (start & ~PAGE_MASK)
   1259		return -EINVAL;
   1260
   1261	if (mode == MPOL_DEFAULT)
   1262		flags &= ~MPOL_MF_STRICT;
   1263
   1264	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
   1265	end = start + len;
   1266
   1267	if (end < start)
   1268		return -EINVAL;
   1269	if (end == start)
   1270		return 0;
   1271
   1272	new = mpol_new(mode, mode_flags, nmask);
   1273	if (IS_ERR(new))
   1274		return PTR_ERR(new);
   1275
   1276	if (flags & MPOL_MF_LAZY)
   1277		new->flags |= MPOL_F_MOF;
   1278
   1279	/*
   1280	 * If we are using the default policy then operation
   1281	 * on discontinuous address spaces is okay after all
   1282	 */
   1283	if (!new)
   1284		flags |= MPOL_MF_DISCONTIG_OK;
   1285
   1286	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
   1287		 start, start + len, mode, mode_flags,
   1288		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
   1289
   1290	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
   1291
   1292		lru_cache_disable();
   1293	}
   1294	{
   1295		NODEMASK_SCRATCH(scratch);
   1296		if (scratch) {
   1297			mmap_write_lock(mm);
   1298			err = mpol_set_nodemask(new, nmask, scratch);
   1299			if (err)
   1300				mmap_write_unlock(mm);
   1301		} else
   1302			err = -ENOMEM;
   1303		NODEMASK_SCRATCH_FREE(scratch);
   1304	}
   1305	if (err)
   1306		goto mpol_out;
   1307
   1308	ret = queue_pages_range(mm, start, end, nmask,
   1309			  flags | MPOL_MF_INVERT, &pagelist);
   1310
   1311	if (ret < 0) {
   1312		err = ret;
   1313		goto up_out;
   1314	}
   1315
   1316	err = mbind_range(mm, start, end, new);
   1317
   1318	if (!err) {
   1319		int nr_failed = 0;
   1320
   1321		if (!list_empty(&pagelist)) {
   1322			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
   1323			nr_failed = migrate_pages(&pagelist, new_page, NULL,
   1324				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
   1325			if (nr_failed)
   1326				putback_movable_pages(&pagelist);
   1327		}
   1328
   1329		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
   1330			err = -EIO;
   1331	} else {
   1332up_out:
   1333		if (!list_empty(&pagelist))
   1334			putback_movable_pages(&pagelist);
   1335	}
   1336
   1337	mmap_write_unlock(mm);
   1338mpol_out:
   1339	mpol_put(new);
   1340	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
   1341		lru_cache_enable();
   1342	return err;
   1343}
   1344
   1345/*
   1346 * User space interface with variable sized bitmaps for nodelists.
   1347 */
   1348static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
   1349		      unsigned long maxnode)
   1350{
   1351	unsigned long nlongs = BITS_TO_LONGS(maxnode);
   1352	int ret;
   1353
   1354	if (in_compat_syscall())
   1355		ret = compat_get_bitmap(mask,
   1356					(const compat_ulong_t __user *)nmask,
   1357					maxnode);
   1358	else
   1359		ret = copy_from_user(mask, nmask,
   1360				     nlongs * sizeof(unsigned long));
   1361
   1362	if (ret)
   1363		return -EFAULT;
   1364
   1365	if (maxnode % BITS_PER_LONG)
   1366		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
   1367
   1368	return 0;
   1369}
   1370
   1371/* Copy a node mask from user space. */
   1372static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
   1373		     unsigned long maxnode)
   1374{
   1375	--maxnode;
   1376	nodes_clear(*nodes);
   1377	if (maxnode == 0 || !nmask)
   1378		return 0;
   1379	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
   1380		return -EINVAL;
   1381
   1382	/*
   1383	 * When the user specifies more nodes than supported, just check
   1384	 * that the unsupported part is all zero, one word at a time,
   1385	 * starting at the end.
   1386	 */
   1387	while (maxnode > MAX_NUMNODES) {
   1388		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
   1389		unsigned long t;
   1390
   1391		if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits))
   1392			return -EFAULT;
   1393
   1394		if (maxnode - bits >= MAX_NUMNODES) {
   1395			maxnode -= bits;
   1396		} else {
   1397			maxnode = MAX_NUMNODES;
   1398			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
   1399		}
   1400		if (t)
   1401			return -EINVAL;
   1402	}
   1403
   1404	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
   1405}
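
/*
 * Note on maxnode (editor's addition, derived from the --maxnode above): the
 * kernel honours maxnode - 1 bits of the user bitmap, so a caller that wants
 * node n in the mask must pass maxnode >= n + 2.  Passing
 * 8 * sizeof(unsigned long) per word is sufficient for small node numbers.
 */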
   1406
   1407/* Copy a kernel node mask to user space */
   1408static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
   1409			      nodemask_t *nodes)
   1410{
   1411	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
   1412	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
   1413	bool compat = in_compat_syscall();
   1414
   1415	if (compat)
   1416		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
   1417
   1418	if (copy > nbytes) {
   1419		if (copy > PAGE_SIZE)
   1420			return -EINVAL;
   1421		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
   1422			return -EFAULT;
   1423		copy = nbytes;
   1424		maxnode = nr_node_ids;
   1425	}
   1426
   1427	if (compat)
   1428		return compat_put_bitmap((compat_ulong_t __user *)mask,
   1429					 nodes_addr(*nodes), maxnode);
   1430
   1431	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
   1432}
   1433
   1434/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
   1435static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
   1436{
   1437	*flags = *mode & MPOL_MODE_FLAGS;
   1438	*mode &= ~MPOL_MODE_FLAGS;
   1439
   1440	if ((unsigned int)(*mode) >=  MPOL_MAX)
   1441		return -EINVAL;
   1442	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
   1443		return -EINVAL;
   1444	if (*flags & MPOL_F_NUMA_BALANCING) {
   1445		if (*mode != MPOL_BIND)
   1446			return -EINVAL;
   1447		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
   1448	}
   1449	return 0;
   1450}
   1451
   1452static long kernel_mbind(unsigned long start, unsigned long len,
   1453			 unsigned long mode, const unsigned long __user *nmask,
   1454			 unsigned long maxnode, unsigned int flags)
   1455{
   1456	unsigned short mode_flags;
   1457	nodemask_t nodes;
   1458	int lmode = mode;
   1459	int err;
   1460
   1461	start = untagged_addr(start);
   1462	err = sanitize_mpol_flags(&lmode, &mode_flags);
   1463	if (err)
   1464		return err;
   1465
   1466	err = get_nodes(&nodes, nmask, maxnode);
   1467	if (err)
   1468		return err;
   1469
   1470	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
   1471}
   1472
   1473SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
   1474		unsigned long, home_node, unsigned long, flags)
   1475{
   1476	struct mm_struct *mm = current->mm;
   1477	struct vm_area_struct *vma;
   1478	struct mempolicy *new;
   1479	unsigned long vmstart;
   1480	unsigned long vmend;
   1481	unsigned long end;
   1482	int err = -ENOENT;
   1483
   1484	start = untagged_addr(start);
   1485	if (start & ~PAGE_MASK)
   1486		return -EINVAL;
   1487	/*
   1488	 * flags is reserved for future extensions, if any.
   1489	 */
   1490	if (flags != 0)
   1491		return -EINVAL;
   1492
   1493	/*
   1494	 * Check home_node is online to avoid accessing uninitialized
   1495	 * NODE_DATA.
   1496	 */
   1497	if (home_node >= MAX_NUMNODES || !node_online(home_node))
   1498		return -EINVAL;
   1499
   1500	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
   1501	end = start + len;
   1502
   1503	if (end < start)
   1504		return -EINVAL;
   1505	if (end == start)
   1506		return 0;
   1507	mmap_write_lock(mm);
   1508	vma = find_vma(mm, start);
   1509	for (; vma && vma->vm_start < end;  vma = vma->vm_next) {
   1510
   1511		vmstart = max(start, vma->vm_start);
   1512		vmend   = min(end, vma->vm_end);
   1513		new = mpol_dup(vma_policy(vma));
   1514		if (IS_ERR(new)) {
   1515			err = PTR_ERR(new);
   1516			break;
   1517		}
   1518		/*
   1519		 * Only update home node if there is an existing vma policy
   1520		 */
   1521		if (!new)
   1522			continue;
   1523
   1524		/*
   1525		 * If any vma in the range has a policy other than MPOL_BIND
   1526		 * or MPOL_PREFERRED_MANY, we return an error. We don't reset
   1527		 * the home node for vmas we already updated before.
   1528		 */
   1529		if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
   1530			err = -EOPNOTSUPP;
   1531			break;
   1532		}
   1533
   1534		new->home_node = home_node;
   1535		err = mbind_range(mm, vmstart, vmend, new);
   1536		mpol_put(new);
   1537		if (err)
   1538			break;
   1539	}
   1540	mmap_write_unlock(mm);
   1541	return err;
   1542}
   1543
   1544SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
   1545		unsigned long, mode, const unsigned long __user *, nmask,
   1546		unsigned long, maxnode, unsigned int, flags)
   1547{
   1548	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
   1549}
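
For completeness, a hedged userspace sketch (editor's addition) of the migration path exercised by do_mbind() above: rebinding an existing mapping and asking the kernel to move its already-faulted pages. The helper name and parameters are illustrative; MPOL_MF_MOVE only migrates pages mapped solely by the calling process, and with MPOL_MF_STRICT the call fails with EIO if some pages could not be moved.

#include <numaif.h>
#include <stddef.h>
#include <stdio.h>

/* buf/len are assumed to describe an existing, page-aligned mapping. */
static int move_to_node1(void *buf, size_t len)
{
	unsigned long node1 = 1UL << 1;

	if (mbind(buf, len, MPOL_BIND, &node1, 8 * sizeof(node1),
		  MPOL_MF_MOVE | MPOL_MF_STRICT)) {
		perror("mbind");	/* e.g. EIO: some pages were not moved */
		return -1;
	}
	return 0;
}
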
   1550
   1551/* Set the process memory policy */
   1552static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
   1553				 unsigned long maxnode)
   1554{
   1555	unsigned short mode_flags;
   1556	nodemask_t nodes;
   1557	int lmode = mode;
   1558	int err;
   1559
   1560	err = sanitize_mpol_flags(&lmode, &mode_flags);
   1561	if (err)
   1562		return err;
   1563
   1564	err = get_nodes(&nodes, nmask, maxnode);
   1565	if (err)
   1566		return err;
   1567
   1568	return do_set_mempolicy(lmode, mode_flags, &nodes);
   1569}
   1570
   1571SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
   1572		unsigned long, maxnode)
   1573{
   1574	return kernel_set_mempolicy(mode, nmask, maxnode);
   1575}
   1576
   1577static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
   1578				const unsigned long __user *old_nodes,
   1579				const unsigned long __user *new_nodes)
   1580{
   1581	struct mm_struct *mm = NULL;
   1582	struct task_struct *task;
   1583	nodemask_t task_nodes;
   1584	int err;
   1585	nodemask_t *old;
   1586	nodemask_t *new;
   1587	NODEMASK_SCRATCH(scratch);
   1588
   1589	if (!scratch)
   1590		return -ENOMEM;
   1591
   1592	old = &scratch->mask1;
   1593	new = &scratch->mask2;
   1594
   1595	err = get_nodes(old, old_nodes, maxnode);
   1596	if (err)
   1597		goto out;
   1598
   1599	err = get_nodes(new, new_nodes, maxnode);
   1600	if (err)
   1601		goto out;
   1602
   1603	/* Find the mm_struct */
   1604	rcu_read_lock();
   1605	task = pid ? find_task_by_vpid(pid) : current;
   1606	if (!task) {
   1607		rcu_read_unlock();
   1608		err = -ESRCH;
   1609		goto out;
   1610	}
   1611	get_task_struct(task);
   1612
   1613	err = -EINVAL;
   1614
   1615	/*
   1616	 * Check if this process has the right to modify the specified process.
   1617	 * Use the regular "ptrace_may_access()" checks.
   1618	 */
   1619	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
   1620		rcu_read_unlock();
   1621		err = -EPERM;
   1622		goto out_put;
   1623	}
   1624	rcu_read_unlock();
   1625
   1626	task_nodes = cpuset_mems_allowed(task);
   1627	/* Is the user allowed to access the target nodes? */
   1628	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
   1629		err = -EPERM;
   1630		goto out_put;
   1631	}
   1632
   1633	task_nodes = cpuset_mems_allowed(current);
   1634	nodes_and(*new, *new, task_nodes);
   1635	if (nodes_empty(*new))
   1636		goto out_put;
   1637
   1638	err = security_task_movememory(task);
   1639	if (err)
   1640		goto out_put;
   1641
   1642	mm = get_task_mm(task);
   1643	put_task_struct(task);
   1644
   1645	if (!mm) {
   1646		err = -EINVAL;
   1647		goto out;
   1648	}
   1649
   1650	err = do_migrate_pages(mm, old, new,
   1651		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
   1652
   1653	mmput(mm);
   1654out:
   1655	NODEMASK_SCRATCH_FREE(scratch);
   1656
   1657	return err;
   1658
   1659out_put:
   1660	put_task_struct(task);
   1661	goto out;
   1662
   1663}
   1664
   1665SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
   1666		const unsigned long __user *, old_nodes,
   1667		const unsigned long __user *, new_nodes)
   1668{
   1669	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
   1670}
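
A minimal userspace sketch of the corresponding syscall (editor's addition; the pid and node choice are hypothetical): migrate_pages(2) takes old and new nodemasks and returns the number of pages that could not be moved, or -1 on error.

#include <numaif.h>
#include <sys/types.h>

/*
 * Ask the kernel to move everything the target process has on node 0
 * over to node 1.  Requires ptrace-level permission on the target pid.
 */
static long drain_node0(pid_t pid)
{
	unsigned long old_nodes = 1UL << 0;
	unsigned long new_nodes = 1UL << 1;

	return migrate_pages(pid, 8 * sizeof(unsigned long),
			     &old_nodes, &new_nodes);
}
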
   1671
   1672
   1673/* Retrieve NUMA policy */
   1674static int kernel_get_mempolicy(int __user *policy,
   1675				unsigned long __user *nmask,
   1676				unsigned long maxnode,
   1677				unsigned long addr,
   1678				unsigned long flags)
   1679{
   1680	int err;
   1681	int pval;
   1682	nodemask_t nodes;
   1683
   1684	if (nmask != NULL && maxnode < nr_node_ids)
   1685		return -EINVAL;
   1686
   1687	addr = untagged_addr(addr);
   1688
   1689	err = do_get_mempolicy(&pval, &nodes, addr, flags);
   1690
   1691	if (err)
   1692		return err;
   1693
   1694	if (policy && put_user(pval, policy))
   1695		return -EFAULT;
   1696
   1697	if (nmask)
   1698		err = copy_nodes_to_user(nmask, maxnode, &nodes);
   1699
   1700	return err;
   1701}
   1702
   1703SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
   1704		unsigned long __user *, nmask, unsigned long, maxnode,
   1705		unsigned long, addr, unsigned long, flags)
   1706{
   1707	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
   1708}
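
A short userspace sketch (editor's addition) of the MPOL_F_NODE | MPOL_F_ADDR query handled by do_get_mempolicy() above: it returns, via the first argument, the id of the node on which the page backing addr is allocated.

#include <numaif.h>

/* Returns the NUMA node backing @addr, or -1 on error. */
static int node_of(void *addr)
{
	int node = -1;

	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
		return -1;
	return node;
}
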
   1709
   1710bool vma_migratable(struct vm_area_struct *vma)
   1711{
   1712	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
   1713		return false;
   1714
   1715	/*
   1716	 * DAX device mappings require predictable access latency, so avoid
   1717	 * incurring periodic faults.
   1718	 */
   1719	if (vma_is_dax(vma))
   1720		return false;
   1721
   1722	if (is_vm_hugetlb_page(vma) &&
   1723		!hugepage_migration_supported(hstate_vma(vma)))
   1724		return false;
   1725
   1726	/*
   1727	 * Migration allocates pages in the highest zone. If we cannot
   1728	 * do so then migration (at least from node to node) is not
   1729	 * possible.
   1730	 */
   1731	if (vma->vm_file &&
   1732		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
   1733			< policy_zone)
   1734		return false;
   1735	return true;
   1736}
   1737
   1738struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
   1739						unsigned long addr)
   1740{
   1741	struct mempolicy *pol = NULL;
   1742
   1743	if (vma) {
   1744		if (vma->vm_ops && vma->vm_ops->get_policy) {
   1745			pol = vma->vm_ops->get_policy(vma, addr);
   1746		} else if (vma->vm_policy) {
   1747			pol = vma->vm_policy;
   1748
   1749			/*
   1750			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
   1751			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
   1752			 * count on these policies which will be dropped by
   1753			 * mpol_cond_put() later
   1754			 */
   1755			if (mpol_needs_cond_ref(pol))
   1756				mpol_get(pol);
   1757		}
   1758	}
   1759
   1760	return pol;
   1761}
   1762
   1763/*
   1764 * get_vma_policy(@vma, @addr)
   1765 * @vma: virtual memory area whose policy is sought
   1766 * @addr: address in @vma for shared policy lookup
   1767 *
   1768 * Returns effective policy for a VMA at specified address.
   1769 * Falls back to current->mempolicy or system default policy, as necessary.
   1770 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
   1771 * count--added by the get_policy() vm_op, as appropriate--to protect against
   1772 * freeing by another task.  It is the caller's responsibility to free the
   1773 * extra reference for shared policies.
   1774 */
   1775static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
   1776						unsigned long addr)
   1777{
   1778	struct mempolicy *pol = __get_vma_policy(vma, addr);
   1779
   1780	if (!pol)
   1781		pol = get_task_policy(current);
   1782
   1783	return pol;
   1784}
   1785
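/*
 * Example (sketch): the usual calling pattern around get_vma_policy().
 * The helper below is hypothetical; mpol_cond_put() drops the extra
 * reference that the get_policy() vm_op may have taken for MPOL_F_SHARED
 * policies and is a no-op otherwise.
 *
 *	static unsigned short example_vma_policy_mode(struct vm_area_struct *vma,
 *						      unsigned long addr)
 *	{
 *		struct mempolicy *pol = get_vma_policy(vma, addr);
 *		unsigned short mode = pol->mode;
 *
 *		mpol_cond_put(pol);
 *		return mode;
 *	}
 */
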
   1786bool vma_policy_mof(struct vm_area_struct *vma)
   1787{
   1788	struct mempolicy *pol;
   1789
   1790	if (vma->vm_ops && vma->vm_ops->get_policy) {
   1791		bool ret = false;
   1792
   1793		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
   1794		if (pol && (pol->flags & MPOL_F_MOF))
   1795			ret = true;
   1796		mpol_cond_put(pol);
   1797
   1798		return ret;
   1799	}
   1800
   1801	pol = vma->vm_policy;
   1802	if (!pol)
   1803		pol = get_task_policy(current);
   1804
   1805	return pol->flags & MPOL_F_MOF;
   1806}
   1807
   1808static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
   1809{
   1810	enum zone_type dynamic_policy_zone = policy_zone;
   1811
   1812	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
   1813
   1814	/*
    1815	 * If policy->nodes has movable memory only,
    1816	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
    1817	 *
    1818	 * policy->nodes has already been intersected with node_states[N_MEMORY],
    1819	 * so if the following test fails, it implies that
    1820	 * policy->nodes contains movable memory only.
   1821	 */
   1822	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
   1823		dynamic_policy_zone = ZONE_MOVABLE;
   1824
   1825	return zone >= dynamic_policy_zone;
   1826}
   1827
   1828/*
   1829 * Return a nodemask representing a mempolicy for filtering nodes for
   1830 * page allocation
   1831 */
   1832nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
   1833{
   1834	int mode = policy->mode;
   1835
   1836	/* Lower zones don't get a nodemask applied for MPOL_BIND */
   1837	if (unlikely(mode == MPOL_BIND) &&
   1838		apply_policy_zone(policy, gfp_zone(gfp)) &&
   1839		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
   1840		return &policy->nodes;
   1841
   1842	if (mode == MPOL_PREFERRED_MANY)
   1843		return &policy->nodes;
   1844
   1845	return NULL;
   1846}
   1847
   1848/*
    1849	 * Return the preferred node id for 'prefer' mempolicy, and return
   1850 * the given id for all other policies.
   1851 *
   1852 * policy_node() is always coupled with policy_nodemask(), which
   1853 * secures the nodemask limit for 'bind' and 'prefer-many' policy.
   1854 */
   1855static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
   1856{
   1857	if (policy->mode == MPOL_PREFERRED) {
   1858		nd = first_node(policy->nodes);
   1859	} else {
   1860		/*
   1861		 * __GFP_THISNODE shouldn't even be used with the bind policy
   1862		 * because we might easily break the expectation to stay on the
   1863		 * requested node and not break the policy.
   1864		 */
   1865		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
   1866	}
   1867
   1868	if ((policy->mode == MPOL_BIND ||
   1869	     policy->mode == MPOL_PREFERRED_MANY) &&
   1870	    policy->home_node != NUMA_NO_NODE)
   1871		return policy->home_node;
   1872
   1873	return nd;
   1874}
   1875
   1876/* Do dynamic interleaving for a process */
   1877static unsigned interleave_nodes(struct mempolicy *policy)
   1878{
   1879	unsigned next;
   1880	struct task_struct *me = current;
   1881
   1882	next = next_node_in(me->il_prev, policy->nodes);
   1883	if (next < MAX_NUMNODES)
   1884		me->il_prev = next;
   1885	return next;
   1886}
   1887
   1888/*
    1889 * Depending on the memory policy, provide a node from which to allocate the
   1890 * next slab entry.
   1891 */
   1892unsigned int mempolicy_slab_node(void)
   1893{
   1894	struct mempolicy *policy;
   1895	int node = numa_mem_id();
   1896
   1897	if (!in_task())
   1898		return node;
   1899
   1900	policy = current->mempolicy;
   1901	if (!policy)
   1902		return node;
   1903
   1904	switch (policy->mode) {
   1905	case MPOL_PREFERRED:
   1906		return first_node(policy->nodes);
   1907
   1908	case MPOL_INTERLEAVE:
   1909		return interleave_nodes(policy);
   1910
   1911	case MPOL_BIND:
   1912	case MPOL_PREFERRED_MANY:
   1913	{
   1914		struct zoneref *z;
   1915
   1916		/*
   1917		 * Follow bind policy behavior and start allocation at the
   1918		 * first node.
   1919		 */
   1920		struct zonelist *zonelist;
   1921		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
   1922		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
   1923		z = first_zones_zonelist(zonelist, highest_zoneidx,
   1924							&policy->nodes);
   1925		return z->zone ? zone_to_nid(z->zone) : node;
   1926	}
   1927	case MPOL_LOCAL:
   1928		return node;
   1929
   1930	default:
   1931		BUG();
   1932	}
   1933}
   1934
   1935/*
   1936 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
   1937 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
   1938 * number of present nodes.
   1939 */
   1940static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
   1941{
   1942	nodemask_t nodemask = pol->nodes;
   1943	unsigned int target, nnodes;
   1944	int i;
   1945	int nid;
   1946	/*
   1947	 * The barrier will stabilize the nodemask in a register or on
   1948	 * the stack so that it will stop changing under the code.
   1949	 *
   1950	 * Between first_node() and next_node(), pol->nodes could be changed
    1951	 * by other threads, so we work on a local copy on the stack.
   1952	 */
   1953	barrier();
   1954
   1955	nnodes = nodes_weight(nodemask);
   1956	if (!nnodes)
   1957		return numa_node_id();
   1958	target = (unsigned int)n % nnodes;
   1959	nid = first_node(nodemask);
   1960	for (i = 0; i < target; i++)
   1961		nid = next_node(nid, nodemask);
   1962	return nid;
   1963}
   1964
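/*
 * Worked example (hypothetical nodemask): with pol->nodes = {0,2,5} and
 * n = 7, nnodes = 3 and target = 7 % 3 = 1, so the walk above steps once
 * from node 0 to node 2 and offset_il_node() returns 2.
 */
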
   1965/* Determine a node number for interleave */
   1966static inline unsigned interleave_nid(struct mempolicy *pol,
   1967		 struct vm_area_struct *vma, unsigned long addr, int shift)
   1968{
   1969	if (vma) {
   1970		unsigned long off;
   1971
   1972		/*
   1973		 * for small pages, there is no difference between
   1974		 * shift and PAGE_SHIFT, so the bit-shift is safe.
   1975		 * for huge pages, since vm_pgoff is in units of small
   1976		 * pages, we need to shift off the always 0 bits to get
   1977		 * a useful offset.
   1978		 */
   1979		BUG_ON(shift < PAGE_SHIFT);
   1980		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
   1981		off += (addr - vma->vm_start) >> shift;
   1982		return offset_il_node(pol, off);
   1983	} else
   1984		return interleave_nodes(pol);
   1985}
   1986
   1987#ifdef CONFIG_HUGETLBFS
   1988/*
   1989 * huge_node(@vma, @addr, @gfp_flags, @mpol)
   1990 * @vma: virtual memory area whose policy is sought
   1991 * @addr: address in @vma for shared policy lookup and interleave policy
   1992 * @gfp_flags: for requested zone
   1993 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
   1994 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
   1995 *
   1996 * Returns a nid suitable for a huge page allocation and a pointer
   1997 * to the struct mempolicy for conditional unref after allocation.
   1998 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
   1999 * to the mempolicy's @nodemask for filtering the zonelist.
   2000 *
   2001 * Must be protected by read_mems_allowed_begin()
   2002 */
   2003int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
   2004				struct mempolicy **mpol, nodemask_t **nodemask)
   2005{
   2006	int nid;
   2007	int mode;
   2008
   2009	*mpol = get_vma_policy(vma, addr);
   2010	*nodemask = NULL;
   2011	mode = (*mpol)->mode;
   2012
   2013	if (unlikely(mode == MPOL_INTERLEAVE)) {
   2014		nid = interleave_nid(*mpol, vma, addr,
   2015					huge_page_shift(hstate_vma(vma)));
   2016	} else {
   2017		nid = policy_node(gfp_flags, *mpol, numa_node_id());
   2018		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
   2019			*nodemask = &(*mpol)->nodes;
   2020	}
   2021	return nid;
   2022}
   2023
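/*
 * Example (sketch): a hugetlb-style caller of huge_node() under the
 * mems_allowed seqlock.  alloc_from_node_mask() stands in for the real
 * huge page allocator and is hypothetical; gfp_mask, vma and addr come
 * from the caller.
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	unsigned int cookie;
 *	struct page *page;
 *	int nid;
 *
 *	do {
 *		cookie = read_mems_allowed_begin();
 *		nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *		page = alloc_from_node_mask(gfp_mask, nid, nodemask);
 *		mpol_cond_put(mpol);
 *	} while (!page && read_mems_allowed_retry(cookie));
 */
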
   2024/*
   2025 * init_nodemask_of_mempolicy
   2026 *
   2027 * If the current task's mempolicy is "default" [NULL], return 'false'
   2028 * to indicate default policy.  Otherwise, extract the policy nodemask
   2029 * for 'bind' or 'interleave' policy into the argument nodemask, or
   2030 * initialize the argument nodemask to contain the single node for
   2031 * 'preferred' or 'local' policy and return 'true' to indicate presence
   2032 * of non-default mempolicy.
   2033 *
   2034 * We don't bother with reference counting the mempolicy [mpol_get/put]
    2035 * because the current task is examining its own mempolicy and a task's
   2036 * mempolicy is only ever changed by the task itself.
   2037 *
   2038 * N.B., it is the caller's responsibility to free a returned nodemask.
   2039 */
   2040bool init_nodemask_of_mempolicy(nodemask_t *mask)
   2041{
   2042	struct mempolicy *mempolicy;
   2043
   2044	if (!(mask && current->mempolicy))
   2045		return false;
   2046
   2047	task_lock(current);
   2048	mempolicy = current->mempolicy;
   2049	switch (mempolicy->mode) {
   2050	case MPOL_PREFERRED:
   2051	case MPOL_PREFERRED_MANY:
   2052	case MPOL_BIND:
   2053	case MPOL_INTERLEAVE:
   2054		*mask = mempolicy->nodes;
   2055		break;
   2056
   2057	case MPOL_LOCAL:
   2058		init_nodemask_of_node(mask, numa_node_id());
   2059		break;
   2060
   2061	default:
   2062		BUG();
   2063	}
   2064	task_unlock(current);
   2065
   2066	return true;
   2067}
   2068#endif
   2069
   2070/*
   2071 * mempolicy_in_oom_domain
   2072 *
   2073 * If tsk's mempolicy is "bind", check for intersection between mask and
   2074 * the policy nodemask. Otherwise, return true for all other policies
   2075 * including "interleave", as a tsk with "interleave" policy may have
    2076 * memory allocated from all nodes in the system.
   2077 *
   2078 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
   2079 */
   2080bool mempolicy_in_oom_domain(struct task_struct *tsk,
   2081					const nodemask_t *mask)
   2082{
   2083	struct mempolicy *mempolicy;
   2084	bool ret = true;
   2085
   2086	if (!mask)
   2087		return ret;
   2088
   2089	task_lock(tsk);
   2090	mempolicy = tsk->mempolicy;
   2091	if (mempolicy && mempolicy->mode == MPOL_BIND)
   2092		ret = nodes_intersects(mempolicy->nodes, *mask);
   2093	task_unlock(tsk);
   2094
   2095	return ret;
   2096}
   2097
    2098	/* Allocate a page under the interleave policy.
   2099   Own path because it needs to do special accounting. */
   2100static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
   2101					unsigned nid)
   2102{
   2103	struct page *page;
   2104
   2105	page = __alloc_pages(gfp, order, nid, NULL);
   2106	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
   2107	if (!static_branch_likely(&vm_numa_stat_key))
   2108		return page;
   2109	if (page && page_to_nid(page) == nid) {
   2110		preempt_disable();
   2111		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
   2112		preempt_enable();
   2113	}
   2114	return page;
   2115}
   2116
   2117static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
   2118						int nid, struct mempolicy *pol)
   2119{
   2120	struct page *page;
   2121	gfp_t preferred_gfp;
   2122
   2123	/*
   2124	 * This is a two pass approach. The first pass will only try the
   2125	 * preferred nodes but skip the direct reclaim and allow the
   2126	 * allocation to fail, while the second pass will try all the
    2127	 * nodes in the system.
   2128	 */
   2129	preferred_gfp = gfp | __GFP_NOWARN;
   2130	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
   2131	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
   2132	if (!page)
   2133		page = __alloc_pages(gfp, order, nid, NULL);
   2134
   2135	return page;
   2136}
   2137
   2138/**
   2139 * vma_alloc_folio - Allocate a folio for a VMA.
   2140 * @gfp: GFP flags.
   2141 * @order: Order of the folio.
   2142 * @vma: Pointer to VMA or NULL if not available.
   2143 * @addr: Virtual address of the allocation.  Must be inside @vma.
   2144 * @hugepage: For hugepages try only the preferred node if possible.
   2145 *
   2146 * Allocate a folio for a specific address in @vma, using the appropriate
   2147 * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
   2148 * of the mm_struct of the VMA to prevent it from going away.  Should be
   2149 * used for all allocations for folios that will be mapped into user space.
   2150 *
   2151 * Return: The folio on success or NULL if allocation fails.
   2152 */
   2153struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
   2154		unsigned long addr, bool hugepage)
   2155{
   2156	struct mempolicy *pol;
   2157	int node = numa_node_id();
   2158	struct folio *folio;
   2159	int preferred_nid;
   2160	nodemask_t *nmask;
   2161
   2162	pol = get_vma_policy(vma, addr);
   2163
   2164	if (pol->mode == MPOL_INTERLEAVE) {
   2165		struct page *page;
   2166		unsigned nid;
   2167
   2168		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
   2169		mpol_cond_put(pol);
   2170		gfp |= __GFP_COMP;
   2171		page = alloc_page_interleave(gfp, order, nid);
   2172		if (page && order > 1)
   2173			prep_transhuge_page(page);
   2174		folio = (struct folio *)page;
   2175		goto out;
   2176	}
   2177
   2178	if (pol->mode == MPOL_PREFERRED_MANY) {
   2179		struct page *page;
   2180
   2181		node = policy_node(gfp, pol, node);
   2182		gfp |= __GFP_COMP;
   2183		page = alloc_pages_preferred_many(gfp, order, node, pol);
   2184		mpol_cond_put(pol);
   2185		if (page && order > 1)
   2186			prep_transhuge_page(page);
   2187		folio = (struct folio *)page;
   2188		goto out;
   2189	}
   2190
   2191	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
   2192		int hpage_node = node;
   2193
   2194		/*
   2195		 * For hugepage allocation and non-interleave policy which
   2196		 * allows the current node (or other explicitly preferred
   2197		 * node) we only try to allocate from the current/preferred
   2198		 * node and don't fall back to other nodes, as the cost of
   2199		 * remote accesses would likely offset THP benefits.
   2200		 *
   2201		 * If the policy is interleave or does not allow the current
   2202		 * node in its nodemask, we allocate the standard way.
   2203		 */
   2204		if (pol->mode == MPOL_PREFERRED)
   2205			hpage_node = first_node(pol->nodes);
   2206
   2207		nmask = policy_nodemask(gfp, pol);
   2208		if (!nmask || node_isset(hpage_node, *nmask)) {
   2209			mpol_cond_put(pol);
   2210			/*
   2211			 * First, try to allocate THP only on local node, but
   2212			 * don't reclaim unnecessarily, just compact.
   2213			 */
   2214			folio = __folio_alloc_node(gfp | __GFP_THISNODE |
   2215					__GFP_NORETRY, order, hpage_node);
   2216
   2217			/*
   2218			 * If hugepage allocations are configured to always
   2219			 * synchronous compact or the vma has been madvised
   2220			 * to prefer hugepage backing, retry allowing remote
   2221			 * memory with both reclaim and compact as well.
   2222			 */
   2223			if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
   2224				folio = __folio_alloc(gfp, order, hpage_node,
   2225						      nmask);
   2226
   2227			goto out;
   2228		}
   2229	}
   2230
   2231	nmask = policy_nodemask(gfp, pol);
   2232	preferred_nid = policy_node(gfp, pol, node);
   2233	folio = __folio_alloc(gfp, order, preferred_nid, nmask);
   2234	mpol_cond_put(pol);
   2235out:
   2236	return folio;
   2237}
   2238EXPORT_SYMBOL(vma_alloc_folio);
   2239
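/*
 * Example (sketch): allocating an order-0 user folio for a faulting
 * address, roughly as an anonymous fault handler would, with the
 * mmap_lock held as required by the kernel-doc above.
 *
 *	struct folio *folio;
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
 *	if (!folio)
 *		return VM_FAULT_OOM;
 */
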
   2240/**
   2241 * alloc_pages - Allocate pages.
   2242 * @gfp: GFP flags.
   2243 * @order: Power of two of number of pages to allocate.
   2244 *
   2245 * Allocate 1 << @order contiguous pages.  The physical address of the
   2246 * first page is naturally aligned (eg an order-3 allocation will be aligned
   2247 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
   2248 * process is honoured when in process context.
   2249 *
   2250 * Context: Can be called from any context, providing the appropriate GFP
   2251 * flags are used.
   2252 * Return: The page on success or NULL if allocation fails.
   2253 */
   2254struct page *alloc_pages(gfp_t gfp, unsigned order)
   2255{
   2256	struct mempolicy *pol = &default_policy;
   2257	struct page *page;
   2258
   2259	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
   2260		pol = get_task_policy(current);
   2261
   2262	/*
   2263	 * No reference counting needed for current->mempolicy
   2264	 * nor system default_policy
   2265	 */
   2266	if (pol->mode == MPOL_INTERLEAVE)
   2267		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
   2268	else if (pol->mode == MPOL_PREFERRED_MANY)
   2269		page = alloc_pages_preferred_many(gfp, order,
   2270				  policy_node(gfp, pol, numa_node_id()), pol);
   2271	else
   2272		page = __alloc_pages(gfp, order,
   2273				policy_node(gfp, pol, numa_node_id()),
   2274				policy_nodemask(gfp, pol));
   2275
   2276	return page;
   2277}
   2278EXPORT_SYMBOL(alloc_pages);
   2279
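/*
 * Example (sketch): a plain order-2 allocation/free pair through this
 * interface; the current task's mempolicy (if any) chooses the node, as
 * described above.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (page) {
 *		memset(page_address(page), 0, 4 * PAGE_SIZE);
 *		__free_pages(page, 2);
 *	}
 */
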
   2280struct folio *folio_alloc(gfp_t gfp, unsigned order)
   2281{
   2282	struct page *page = alloc_pages(gfp | __GFP_COMP, order);
   2283
   2284	if (page && order > 1)
   2285		prep_transhuge_page(page);
   2286	return (struct folio *)page;
   2287}
   2288EXPORT_SYMBOL(folio_alloc);
   2289
   2290static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
   2291		struct mempolicy *pol, unsigned long nr_pages,
   2292		struct page **page_array)
   2293{
   2294	int nodes;
   2295	unsigned long nr_pages_per_node;
   2296	int delta;
   2297	int i;
   2298	unsigned long nr_allocated;
   2299	unsigned long total_allocated = 0;
   2300
   2301	nodes = nodes_weight(pol->nodes);
   2302	nr_pages_per_node = nr_pages / nodes;
   2303	delta = nr_pages - nodes * nr_pages_per_node;
   2304
   2305	for (i = 0; i < nodes; i++) {
   2306		if (delta) {
   2307			nr_allocated = __alloc_pages_bulk(gfp,
   2308					interleave_nodes(pol), NULL,
   2309					nr_pages_per_node + 1, NULL,
   2310					page_array);
   2311			delta--;
   2312		} else {
   2313			nr_allocated = __alloc_pages_bulk(gfp,
   2314					interleave_nodes(pol), NULL,
   2315					nr_pages_per_node, NULL, page_array);
   2316		}
   2317
   2318		page_array += nr_allocated;
   2319		total_allocated += nr_allocated;
   2320	}
   2321
   2322	return total_allocated;
   2323}
   2324
   2325static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
   2326		struct mempolicy *pol, unsigned long nr_pages,
   2327		struct page **page_array)
   2328{
   2329	gfp_t preferred_gfp;
   2330	unsigned long nr_allocated = 0;
   2331
   2332	preferred_gfp = gfp | __GFP_NOWARN;
   2333	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
   2334
   2335	nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
   2336					   nr_pages, NULL, page_array);
   2337
   2338	if (nr_allocated < nr_pages)
   2339		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
   2340				nr_pages - nr_allocated, NULL,
   2341				page_array + nr_allocated);
   2342	return nr_allocated;
   2343}
   2344
    2345	/* Bulk page allocation and the mempolicy should be considered at the
    2346	 * same time in some situations, such as vmalloc.
    2347	 *
    2348	 * It can accelerate memory allocation, especially for the
    2349	 * interleave policy.
   2350 */
   2351unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
   2352		unsigned long nr_pages, struct page **page_array)
   2353{
   2354	struct mempolicy *pol = &default_policy;
   2355
   2356	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
   2357		pol = get_task_policy(current);
   2358
   2359	if (pol->mode == MPOL_INTERLEAVE)
   2360		return alloc_pages_bulk_array_interleave(gfp, pol,
   2361							 nr_pages, page_array);
   2362
   2363	if (pol->mode == MPOL_PREFERRED_MANY)
   2364		return alloc_pages_bulk_array_preferred_many(gfp,
   2365				numa_node_id(), pol, nr_pages, page_array);
   2366
   2367	return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
   2368				  policy_nodemask(gfp, pol), nr_pages, NULL,
   2369				  page_array);
   2370}
   2371
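/*
 * Example (sketch): filling a page array the way a vmalloc-style caller
 * might; nr_pages and the kvcalloc()'d array are illustrative.  A real
 * caller falls back to allocating the remaining pages one by one when
 * fewer than nr_pages are returned.
 *
 *	struct page **pages = kvcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
 *	unsigned long got;
 *
 *	if (pages)
 *		got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL,
 *						       nr_pages, pages);
 */
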
   2372int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
   2373{
   2374	struct mempolicy *pol = mpol_dup(vma_policy(src));
   2375
   2376	if (IS_ERR(pol))
   2377		return PTR_ERR(pol);
   2378	dst->vm_policy = pol;
   2379	return 0;
   2380}
   2381
   2382/*
   2383 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
    2384 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
   2385 * with the mems_allowed returned by cpuset_mems_allowed().  This
   2386 * keeps mempolicies cpuset relative after its cpuset moves.  See
   2387 * further kernel/cpuset.c update_nodemask().
   2388 *
    2389 * current's mempolicy may be rebound by another task (the task that changes
    2390 * the cpuset's mems), so we need not do the rebind work for the current task.
   2391 */
   2392
   2393/* Slow path of a mempolicy duplicate */
   2394struct mempolicy *__mpol_dup(struct mempolicy *old)
   2395{
   2396	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
   2397
   2398	if (!new)
   2399		return ERR_PTR(-ENOMEM);
   2400
   2401	/* task's mempolicy is protected by alloc_lock */
   2402	if (old == current->mempolicy) {
   2403		task_lock(current);
   2404		*new = *old;
   2405		task_unlock(current);
   2406	} else
   2407		*new = *old;
   2408
   2409	if (current_cpuset_is_being_rebound()) {
   2410		nodemask_t mems = cpuset_mems_allowed(current);
   2411		mpol_rebind_policy(new, &mems);
   2412	}
   2413	atomic_set(&new->refcnt, 1);
   2414	return new;
   2415}
   2416
   2417/* Slow path of a mempolicy comparison */
   2418bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
   2419{
   2420	if (!a || !b)
   2421		return false;
   2422	if (a->mode != b->mode)
   2423		return false;
   2424	if (a->flags != b->flags)
   2425		return false;
   2426	if (a->home_node != b->home_node)
   2427		return false;
   2428	if (mpol_store_user_nodemask(a))
   2429		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
   2430			return false;
   2431
   2432	switch (a->mode) {
   2433	case MPOL_BIND:
   2434	case MPOL_INTERLEAVE:
   2435	case MPOL_PREFERRED:
   2436	case MPOL_PREFERRED_MANY:
   2437		return !!nodes_equal(a->nodes, b->nodes);
   2438	case MPOL_LOCAL:
   2439		return true;
   2440	default:
   2441		BUG();
   2442		return false;
   2443	}
   2444}
   2445
   2446/*
   2447 * Shared memory backing store policy support.
   2448 *
   2449 * Remember policies even when nobody has shared memory mapped.
   2450 * The policies are kept in Red-Black tree linked from the inode.
   2451 * They are protected by the sp->lock rwlock, which should be held
   2452 * for any accesses to the tree.
   2453 */
   2454
   2455/*
   2456 * lookup first element intersecting start-end.  Caller holds sp->lock for
   2457 * reading or for writing
   2458 */
   2459static struct sp_node *
   2460sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
   2461{
   2462	struct rb_node *n = sp->root.rb_node;
   2463
   2464	while (n) {
   2465		struct sp_node *p = rb_entry(n, struct sp_node, nd);
   2466
   2467		if (start >= p->end)
   2468			n = n->rb_right;
   2469		else if (end <= p->start)
   2470			n = n->rb_left;
   2471		else
   2472			break;
   2473	}
   2474	if (!n)
   2475		return NULL;
   2476	for (;;) {
   2477		struct sp_node *w = NULL;
   2478		struct rb_node *prev = rb_prev(n);
   2479		if (!prev)
   2480			break;
   2481		w = rb_entry(prev, struct sp_node, nd);
   2482		if (w->end <= start)
   2483			break;
   2484		n = prev;
   2485	}
   2486	return rb_entry(n, struct sp_node, nd);
   2487}
   2488
   2489/*
   2490 * Insert a new shared policy into the list.  Caller holds sp->lock for
   2491 * writing.
   2492 */
   2493static void sp_insert(struct shared_policy *sp, struct sp_node *new)
   2494{
   2495	struct rb_node **p = &sp->root.rb_node;
   2496	struct rb_node *parent = NULL;
   2497	struct sp_node *nd;
   2498
   2499	while (*p) {
   2500		parent = *p;
   2501		nd = rb_entry(parent, struct sp_node, nd);
   2502		if (new->start < nd->start)
   2503			p = &(*p)->rb_left;
   2504		else if (new->end > nd->end)
   2505			p = &(*p)->rb_right;
   2506		else
   2507			BUG();
   2508	}
   2509	rb_link_node(&new->nd, parent, p);
   2510	rb_insert_color(&new->nd, &sp->root);
   2511	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
   2512		 new->policy ? new->policy->mode : 0);
   2513}
   2514
   2515/* Find shared policy intersecting idx */
   2516struct mempolicy *
   2517mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
   2518{
   2519	struct mempolicy *pol = NULL;
   2520	struct sp_node *sn;
   2521
   2522	if (!sp->root.rb_node)
   2523		return NULL;
   2524	read_lock(&sp->lock);
   2525	sn = sp_lookup(sp, idx, idx+1);
   2526	if (sn) {
   2527		mpol_get(sn->policy);
   2528		pol = sn->policy;
   2529	}
   2530	read_unlock(&sp->lock);
   2531	return pol;
   2532}
   2533
   2534static void sp_free(struct sp_node *n)
   2535{
   2536	mpol_put(n->policy);
   2537	kmem_cache_free(sn_cache, n);
   2538}
   2539
   2540/**
   2541 * mpol_misplaced - check whether current page node is valid in policy
   2542 *
   2543 * @page: page to be checked
   2544 * @vma: vm area where page mapped
   2545 * @addr: virtual address where page mapped
   2546 *
   2547 * Lookup current policy node id for vma,addr and "compare to" page's
   2548 * node id.  Policy determination "mimics" alloc_page_vma().
   2549 * Called from fault path where we know the vma and faulting address.
   2550 *
   2551 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
   2552 * policy, or a suitable node ID to allocate a replacement page from.
   2553 */
   2554int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
   2555{
   2556	struct mempolicy *pol;
   2557	struct zoneref *z;
   2558	int curnid = page_to_nid(page);
   2559	unsigned long pgoff;
   2560	int thiscpu = raw_smp_processor_id();
   2561	int thisnid = cpu_to_node(thiscpu);
   2562	int polnid = NUMA_NO_NODE;
   2563	int ret = NUMA_NO_NODE;
   2564
   2565	pol = get_vma_policy(vma, addr);
   2566	if (!(pol->flags & MPOL_F_MOF))
   2567		goto out;
   2568
   2569	switch (pol->mode) {
   2570	case MPOL_INTERLEAVE:
   2571		pgoff = vma->vm_pgoff;
   2572		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
   2573		polnid = offset_il_node(pol, pgoff);
   2574		break;
   2575
   2576	case MPOL_PREFERRED:
   2577		if (node_isset(curnid, pol->nodes))
   2578			goto out;
   2579		polnid = first_node(pol->nodes);
   2580		break;
   2581
   2582	case MPOL_LOCAL:
   2583		polnid = numa_node_id();
   2584		break;
   2585
   2586	case MPOL_BIND:
   2587		/* Optimize placement among multiple nodes via NUMA balancing */
   2588		if (pol->flags & MPOL_F_MORON) {
   2589			if (node_isset(thisnid, pol->nodes))
   2590				break;
   2591			goto out;
   2592		}
   2593		fallthrough;
   2594
   2595	case MPOL_PREFERRED_MANY:
   2596		/*
   2597		 * use current page if in policy nodemask,
   2598		 * else select nearest allowed node, if any.
   2599		 * If no allowed nodes, use current [!misplaced].
   2600		 */
   2601		if (node_isset(curnid, pol->nodes))
   2602			goto out;
   2603		z = first_zones_zonelist(
   2604				node_zonelist(numa_node_id(), GFP_HIGHUSER),
   2605				gfp_zone(GFP_HIGHUSER),
   2606				&pol->nodes);
   2607		polnid = zone_to_nid(z->zone);
   2608		break;
   2609
   2610	default:
   2611		BUG();
   2612	}
   2613
   2614	/* Migrate the page towards the node whose CPU is referencing it */
   2615	if (pol->flags & MPOL_F_MORON) {
   2616		polnid = thisnid;
   2617
   2618		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
   2619			goto out;
   2620	}
   2621
   2622	if (curnid != polnid)
   2623		ret = polnid;
   2624out:
   2625	mpol_cond_put(pol);
   2626
   2627	return ret;
   2628}
   2629
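/*
 * Example (sketch): how a NUMA hinting fault handler consumes the return
 * value, roughly as the fault path does; the surrounding fault plumbing
 * is elided here.
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid != NUMA_NO_NODE)
 *		migrate_misplaced_page(page, vma, target_nid);
 */
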
   2630/*
   2631 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
   2632 * dropped after task->mempolicy is set to NULL so that any allocation done as
   2633 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
   2634 * policy.
   2635 */
   2636void mpol_put_task_policy(struct task_struct *task)
   2637{
   2638	struct mempolicy *pol;
   2639
   2640	task_lock(task);
   2641	pol = task->mempolicy;
   2642	task->mempolicy = NULL;
   2643	task_unlock(task);
   2644	mpol_put(pol);
   2645}
   2646
   2647static void sp_delete(struct shared_policy *sp, struct sp_node *n)
   2648{
    2649	pr_debug("deleting %lx-%lx\n", n->start, n->end);
   2650	rb_erase(&n->nd, &sp->root);
   2651	sp_free(n);
   2652}
   2653
   2654static void sp_node_init(struct sp_node *node, unsigned long start,
   2655			unsigned long end, struct mempolicy *pol)
   2656{
   2657	node->start = start;
   2658	node->end = end;
   2659	node->policy = pol;
   2660}
   2661
   2662static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
   2663				struct mempolicy *pol)
   2664{
   2665	struct sp_node *n;
   2666	struct mempolicy *newpol;
   2667
   2668	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
   2669	if (!n)
   2670		return NULL;
   2671
   2672	newpol = mpol_dup(pol);
   2673	if (IS_ERR(newpol)) {
   2674		kmem_cache_free(sn_cache, n);
   2675		return NULL;
   2676	}
   2677	newpol->flags |= MPOL_F_SHARED;
   2678	sp_node_init(n, start, end, newpol);
   2679
   2680	return n;
   2681}
   2682
   2683/* Replace a policy range. */
   2684static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
   2685				 unsigned long end, struct sp_node *new)
   2686{
   2687	struct sp_node *n;
   2688	struct sp_node *n_new = NULL;
   2689	struct mempolicy *mpol_new = NULL;
   2690	int ret = 0;
   2691
   2692restart:
   2693	write_lock(&sp->lock);
   2694	n = sp_lookup(sp, start, end);
   2695	/* Take care of old policies in the same range. */
   2696	while (n && n->start < end) {
   2697		struct rb_node *next = rb_next(&n->nd);
   2698		if (n->start >= start) {
   2699			if (n->end <= end)
   2700				sp_delete(sp, n);
   2701			else
   2702				n->start = end;
   2703		} else {
   2704			/* Old policy spanning whole new range. */
   2705			if (n->end > end) {
   2706				if (!n_new)
   2707					goto alloc_new;
   2708
   2709				*mpol_new = *n->policy;
   2710				atomic_set(&mpol_new->refcnt, 1);
   2711				sp_node_init(n_new, end, n->end, mpol_new);
   2712				n->end = start;
   2713				sp_insert(sp, n_new);
   2714				n_new = NULL;
   2715				mpol_new = NULL;
   2716				break;
   2717			} else
   2718				n->end = start;
   2719		}
   2720		if (!next)
   2721			break;
   2722		n = rb_entry(next, struct sp_node, nd);
   2723	}
   2724	if (new)
   2725		sp_insert(sp, new);
   2726	write_unlock(&sp->lock);
   2727	ret = 0;
   2728
   2729err_out:
   2730	if (mpol_new)
   2731		mpol_put(mpol_new);
   2732	if (n_new)
   2733		kmem_cache_free(sn_cache, n_new);
   2734
   2735	return ret;
   2736
   2737alloc_new:
   2738	write_unlock(&sp->lock);
   2739	ret = -ENOMEM;
   2740	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
   2741	if (!n_new)
   2742		goto err_out;
   2743	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
   2744	if (!mpol_new)
   2745		goto err_out;
   2746	atomic_set(&mpol_new->refcnt, 1);
   2747	goto restart;
   2748}
   2749
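/*
 * Worked example (hypothetical ranges): if the tree holds a single node
 * covering pages [0, 10) with policy A and a new node for [3, 7) with
 * policy B is installed, the old node is trimmed to [0, 3), a copy of A
 * is inserted for [7, 10), and B goes in between, leaving
 * [0, 3) = A, [3, 7) = B, [7, 10) = A.
 */
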
   2750/**
   2751 * mpol_shared_policy_init - initialize shared policy for inode
   2752 * @sp: pointer to inode shared policy
   2753 * @mpol:  struct mempolicy to install
   2754 *
   2755 * Install non-NULL @mpol in inode's shared policy rb-tree.
   2756 * On entry, the current task has a reference on a non-NULL @mpol.
   2757 * This must be released on exit.
    2758 * This is called during get_inode() calls, so we can use GFP_KERNEL.
   2759 */
   2760void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
   2761{
   2762	int ret;
   2763
   2764	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
   2765	rwlock_init(&sp->lock);
   2766
   2767	if (mpol) {
   2768		struct vm_area_struct pvma;
   2769		struct mempolicy *new;
   2770		NODEMASK_SCRATCH(scratch);
   2771
   2772		if (!scratch)
   2773			goto put_mpol;
   2774		/* contextualize the tmpfs mount point mempolicy */
   2775		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
   2776		if (IS_ERR(new))
   2777			goto free_scratch; /* no valid nodemask intersection */
   2778
   2779		task_lock(current);
   2780		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
   2781		task_unlock(current);
   2782		if (ret)
   2783			goto put_new;
   2784
   2785		/* Create pseudo-vma that contains just the policy */
   2786		vma_init(&pvma, NULL);
   2787		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
   2788		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
   2789
   2790put_new:
   2791		mpol_put(new);			/* drop initial ref */
   2792free_scratch:
   2793		NODEMASK_SCRATCH_FREE(scratch);
   2794put_mpol:
   2795		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
   2796	}
   2797}
   2798
   2799int mpol_set_shared_policy(struct shared_policy *info,
   2800			struct vm_area_struct *vma, struct mempolicy *npol)
   2801{
   2802	int err;
   2803	struct sp_node *new = NULL;
   2804	unsigned long sz = vma_pages(vma);
   2805
   2806	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
   2807		 vma->vm_pgoff,
   2808		 sz, npol ? npol->mode : -1,
   2809		 npol ? npol->flags : -1,
   2810		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
   2811
   2812	if (npol) {
   2813		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
   2814		if (!new)
   2815			return -ENOMEM;
   2816	}
   2817	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
   2818	if (err && new)
   2819		sp_free(new);
   2820	return err;
   2821}
   2822
   2823/* Free a backing policy store on inode delete. */
   2824void mpol_free_shared_policy(struct shared_policy *p)
   2825{
   2826	struct sp_node *n;
   2827	struct rb_node *next;
   2828
   2829	if (!p->root.rb_node)
   2830		return;
   2831	write_lock(&p->lock);
   2832	next = rb_first(&p->root);
   2833	while (next) {
   2834		n = rb_entry(next, struct sp_node, nd);
   2835		next = rb_next(&n->nd);
   2836		sp_delete(p, n);
   2837	}
   2838	write_unlock(&p->lock);
   2839}
   2840
   2841#ifdef CONFIG_NUMA_BALANCING
   2842static int __initdata numabalancing_override;
   2843
   2844static void __init check_numabalancing_enable(void)
   2845{
   2846	bool numabalancing_default = false;
   2847
   2848	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
   2849		numabalancing_default = true;
   2850
   2851	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
   2852	if (numabalancing_override)
   2853		set_numabalancing_state(numabalancing_override == 1);
   2854
   2855	if (num_online_nodes() > 1 && !numabalancing_override) {
   2856		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
   2857			numabalancing_default ? "Enabling" : "Disabling");
   2858		set_numabalancing_state(numabalancing_default);
   2859	}
   2860}
   2861
   2862static int __init setup_numabalancing(char *str)
   2863{
   2864	int ret = 0;
   2865	if (!str)
   2866		goto out;
   2867
   2868	if (!strcmp(str, "enable")) {
   2869		numabalancing_override = 1;
   2870		ret = 1;
   2871	} else if (!strcmp(str, "disable")) {
   2872		numabalancing_override = -1;
   2873		ret = 1;
   2874	}
   2875out:
   2876	if (!ret)
   2877		pr_warn("Unable to parse numa_balancing=\n");
   2878
   2879	return ret;
   2880}
   2881__setup("numa_balancing=", setup_numabalancing);
   2882#else
   2883static inline void __init check_numabalancing_enable(void)
   2884{
   2885}
   2886#endif /* CONFIG_NUMA_BALANCING */
   2887
   2888/* assumes fs == KERNEL_DS */
   2889void __init numa_policy_init(void)
   2890{
   2891	nodemask_t interleave_nodes;
   2892	unsigned long largest = 0;
   2893	int nid, prefer = 0;
   2894
   2895	policy_cache = kmem_cache_create("numa_policy",
   2896					 sizeof(struct mempolicy),
   2897					 0, SLAB_PANIC, NULL);
   2898
   2899	sn_cache = kmem_cache_create("shared_policy_node",
   2900				     sizeof(struct sp_node),
   2901				     0, SLAB_PANIC, NULL);
   2902
   2903	for_each_node(nid) {
   2904		preferred_node_policy[nid] = (struct mempolicy) {
   2905			.refcnt = ATOMIC_INIT(1),
   2906			.mode = MPOL_PREFERRED,
   2907			.flags = MPOL_F_MOF | MPOL_F_MORON,
   2908			.nodes = nodemask_of_node(nid),
   2909		};
   2910	}
   2911
   2912	/*
   2913	 * Set interleaving policy for system init. Interleaving is only
    2914	 * enabled across suitably sized nodes (default is >= 16MB), falling
    2915	 * back to the largest node if they're all smaller.
   2916	 */
   2917	nodes_clear(interleave_nodes);
   2918	for_each_node_state(nid, N_MEMORY) {
   2919		unsigned long total_pages = node_present_pages(nid);
   2920
   2921		/* Preserve the largest node */
   2922		if (largest < total_pages) {
   2923			largest = total_pages;
   2924			prefer = nid;
   2925		}
   2926
   2927		/* Interleave this node? */
   2928		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
   2929			node_set(nid, interleave_nodes);
   2930	}
   2931
   2932	/* All too small, use the largest */
   2933	if (unlikely(nodes_empty(interleave_nodes)))
   2934		node_set(prefer, interleave_nodes);
   2935
   2936	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
   2937		pr_err("%s: interleaving failed\n", __func__);
   2938
   2939	check_numabalancing_enable();
   2940}
   2941
   2942/* Reset policy of current process to default */
   2943void numa_default_policy(void)
   2944{
   2945	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
   2946}
   2947
   2948/*
   2949 * Parse and format mempolicy from/to strings
   2950 */
   2951
   2952static const char * const policy_modes[] =
   2953{
   2954	[MPOL_DEFAULT]    = "default",
   2955	[MPOL_PREFERRED]  = "prefer",
   2956	[MPOL_BIND]       = "bind",
   2957	[MPOL_INTERLEAVE] = "interleave",
   2958	[MPOL_LOCAL]      = "local",
   2959	[MPOL_PREFERRED_MANY]  = "prefer (many)",
   2960};
   2961
   2962
   2963#ifdef CONFIG_TMPFS
   2964/**
   2965 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
   2966 * @str:  string containing mempolicy to parse
   2967 * @mpol:  pointer to struct mempolicy pointer, returned on success.
   2968 *
   2969 * Format of input:
   2970 *	<mode>[=<flags>][:<nodelist>]
   2971 *
   2972 * Return: %0 on success, else %1
   2973 */
   2974int mpol_parse_str(char *str, struct mempolicy **mpol)
   2975{
   2976	struct mempolicy *new = NULL;
   2977	unsigned short mode_flags;
   2978	nodemask_t nodes;
   2979	char *nodelist = strchr(str, ':');
   2980	char *flags = strchr(str, '=');
   2981	int err = 1, mode;
   2982
   2983	if (flags)
   2984		*flags++ = '\0';	/* terminate mode string */
   2985
   2986	if (nodelist) {
   2987		/* NUL-terminate mode or flags string */
   2988		*nodelist++ = '\0';
   2989		if (nodelist_parse(nodelist, nodes))
   2990			goto out;
   2991		if (!nodes_subset(nodes, node_states[N_MEMORY]))
   2992			goto out;
   2993	} else
   2994		nodes_clear(nodes);
   2995
   2996	mode = match_string(policy_modes, MPOL_MAX, str);
   2997	if (mode < 0)
   2998		goto out;
   2999
   3000	switch (mode) {
   3001	case MPOL_PREFERRED:
   3002		/*
   3003		 * Insist on a nodelist of one node only, although later
   3004		 * we use first_node(nodes) to grab a single node, so here
   3005		 * nodelist (or nodes) cannot be empty.
   3006		 */
   3007		if (nodelist) {
   3008			char *rest = nodelist;
   3009			while (isdigit(*rest))
   3010				rest++;
   3011			if (*rest)
   3012				goto out;
   3013			if (nodes_empty(nodes))
   3014				goto out;
   3015		}
   3016		break;
   3017	case MPOL_INTERLEAVE:
   3018		/*
   3019		 * Default to online nodes with memory if no nodelist
   3020		 */
   3021		if (!nodelist)
   3022			nodes = node_states[N_MEMORY];
   3023		break;
   3024	case MPOL_LOCAL:
   3025		/*
   3026		 * Don't allow a nodelist;  mpol_new() checks flags
   3027		 */
   3028		if (nodelist)
   3029			goto out;
   3030		break;
   3031	case MPOL_DEFAULT:
   3032		/*
    3033		 * Insist on an empty nodelist
   3034		 */
   3035		if (!nodelist)
   3036			err = 0;
   3037		goto out;
   3038	case MPOL_PREFERRED_MANY:
   3039	case MPOL_BIND:
   3040		/*
   3041		 * Insist on a nodelist
   3042		 */
   3043		if (!nodelist)
   3044			goto out;
   3045	}
   3046
   3047	mode_flags = 0;
   3048	if (flags) {
   3049		/*
   3050		 * Currently, we only support two mutually exclusive
   3051		 * mode flags.
   3052		 */
   3053		if (!strcmp(flags, "static"))
   3054			mode_flags |= MPOL_F_STATIC_NODES;
   3055		else if (!strcmp(flags, "relative"))
   3056			mode_flags |= MPOL_F_RELATIVE_NODES;
   3057		else
   3058			goto out;
   3059	}
   3060
   3061	new = mpol_new(mode, mode_flags, &nodes);
   3062	if (IS_ERR(new))
   3063		goto out;
   3064
   3065	/*
   3066	 * Save nodes for mpol_to_str() to show the tmpfs mount options
   3067	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
   3068	 */
   3069	if (mode != MPOL_PREFERRED) {
   3070		new->nodes = nodes;
   3071	} else if (nodelist) {
   3072		nodes_clear(new->nodes);
   3073		node_set(first_node(nodes), new->nodes);
   3074	} else {
   3075		new->mode = MPOL_LOCAL;
   3076	}
   3077
   3078	/*
   3079	 * Save nodes for contextualization: this will be used to "clone"
   3080	 * the mempolicy in a specific context [cpuset] at a later time.
   3081	 */
   3082	new->w.user_nodemask = nodes;
   3083
   3084	err = 0;
   3085
   3086out:
   3087	/* Restore string for error message */
   3088	if (nodelist)
   3089		*--nodelist = ':';
   3090	if (flags)
   3091		*--flags = '=';
   3092	if (!err)
   3093		*mpol = new;
   3094	return err;
   3095}
   3096#endif /* CONFIG_TMPFS */
   3097
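/*
 * Example (sketch): parsing a tmpfs "mpol=" mount option such as
 * "bind=static:0-3".  The string must be writable because parsing
 * NUL-terminates the mode and flag substrings in place (and restores the
 * separators before returning); a real caller such as shmem's mount-option
 * handling installs the returned mempolicy instead of dropping it.
 *
 *	struct mempolicy *mpol;
 *	char *opt = kstrdup("bind=static:0-3", GFP_KERNEL);
 *
 *	if (opt && !mpol_parse_str(opt, &mpol))
 *		mpol_put(mpol);
 *	kfree(opt);
 */
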
   3098/**
   3099 * mpol_to_str - format a mempolicy structure for printing
   3100 * @buffer:  to contain formatted mempolicy string
   3101 * @maxlen:  length of @buffer
   3102 * @pol:  pointer to mempolicy to be formatted
   3103 *
   3104 * Convert @pol into a string.  If @buffer is too short, truncate the string.
   3105 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
   3106 * longest flag, "relative", and to display at least a few node ids.
   3107 */
   3108void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
   3109{
   3110	char *p = buffer;
   3111	nodemask_t nodes = NODE_MASK_NONE;
   3112	unsigned short mode = MPOL_DEFAULT;
   3113	unsigned short flags = 0;
   3114
   3115	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
   3116		mode = pol->mode;
   3117		flags = pol->flags;
   3118	}
   3119
   3120	switch (mode) {
   3121	case MPOL_DEFAULT:
   3122	case MPOL_LOCAL:
   3123		break;
   3124	case MPOL_PREFERRED:
   3125	case MPOL_PREFERRED_MANY:
   3126	case MPOL_BIND:
   3127	case MPOL_INTERLEAVE:
   3128		nodes = pol->nodes;
   3129		break;
   3130	default:
   3131		WARN_ON_ONCE(1);
   3132		snprintf(p, maxlen, "unknown");
   3133		return;
   3134	}
   3135
   3136	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
   3137
   3138	if (flags & MPOL_MODE_FLAGS) {
   3139		p += snprintf(p, buffer + maxlen - p, "=");
   3140
   3141		/*
   3142		 * Currently, the only defined flags are mutually exclusive
   3143		 */
   3144		if (flags & MPOL_F_STATIC_NODES)
   3145			p += snprintf(p, buffer + maxlen - p, "static");
   3146		else if (flags & MPOL_F_RELATIVE_NODES)
   3147			p += snprintf(p, buffer + maxlen - p, "relative");
   3148	}
   3149
   3150	if (!nodes_empty(nodes))
   3151		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
   3152			       nodemask_pr_args(&nodes));
   3153}
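
/*
 * Example (sketch): formatting a policy for /proc-style output.  For an
 * interleave policy over nodes 0-3 with MPOL_F_RELATIVE_NODES set, the
 * buffer below ends up containing "interleave=relative:0-3".
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 */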