cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mlock.c (19461B)


// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

struct mlock_pvec {
	local_lock_t lock;
	struct pagevec vec;
};

static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);

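/*
 * Illustrative sketch (not part of the original mlock.c): can_do_mlock()
 * mirrors what userspace observes -- mlock() is permitted when
 * RLIMIT_MEMLOCK is non-zero or the caller has CAP_IPC_LOCK.  A minimal
 * userspace probe along those lines, using only standard libc calls:
 *
 *	#include <sys/mman.h>
 *	#include <sys/resource.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		static char buf[4096];
 *		struct rlimit rl;
 *
 *		if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0)
 *			printf("RLIMIT_MEMLOCK cur=%llu\n",
 *			       (unsigned long long)rl.rlim_cur);
 *		if (mlock(buf, sizeof(buf)) == 0) {
 *			puts("mlock succeeded");
 *			munlock(buf, sizeof(buf));
 *		} else {
 *			perror("mlock");	// EPERM when neither condition holds
 *		}
 *		return 0;
 *	}
 */
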
/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */

static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec)
{
	/* There is nothing more we can do while it's off LRU */
	if (!TestClearPageLRU(page))
		return lruvec;

	lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);

	if (unlikely(page_evictable(page))) {
		/*
		 * This is a little surprising, but quite possible:
		 * PageMlocked must have got cleared already by another CPU.
		 * Could this page be on the Unevictable LRU?  I'm not sure,
		 * but move it now if so.
		 */
		if (PageUnevictable(page)) {
			del_page_from_lru_list(page, lruvec);
			ClearPageUnevictable(page);
			add_page_to_lru_list(page, lruvec);
			__count_vm_events(UNEVICTABLE_PGRESCUED,
					  thp_nr_pages(page));
		}
		goto out;
	}

	if (PageUnevictable(page)) {
		if (PageMlocked(page))
			page->mlock_count++;
		goto out;
	}

	del_page_from_lru_list(page, lruvec);
	ClearPageActive(page);
	SetPageUnevictable(page);
	page->mlock_count = !!PageMlocked(page);
	add_page_to_lru_list(page, lruvec);
	__count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
out:
	SetPageLRU(page);
	return lruvec;
}

static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);

	lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);

	/* As above, this is a little surprising, but possible */
	if (unlikely(page_evictable(page)))
		goto out;

	SetPageUnevictable(page);
	page->mlock_count = !!PageMlocked(page);
	__count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
out:
	add_page_to_lru_list(page, lruvec);
	SetPageLRU(page);
	return lruvec;
}

static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
{
	int nr_pages = thp_nr_pages(page);
	bool isolated = false;

	if (!TestClearPageLRU(page))
		goto munlock;

	isolated = true;
	lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);

	if (PageUnevictable(page)) {
		/* Then mlock_count is maintained, but might undercount */
		if (page->mlock_count)
			page->mlock_count--;
		if (page->mlock_count)
			goto out;
	}
	/* else assume that was the last mlock: reclaim will fix it if not */

munlock:
	if (TestClearPageMlocked(page)) {
		__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
		if (isolated || !PageUnevictable(page))
			__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		else
			__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}

	/* page_evictable() has to be checked *after* clearing Mlocked */
	if (isolated && PageUnevictable(page) && page_evictable(page)) {
		del_page_from_lru_list(page, lruvec);
		ClearPageUnevictable(page);
		add_page_to_lru_list(page, lruvec);
		__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	}
out:
	if (isolated)
		SetPageLRU(page);
	return lruvec;
}

/*
 * Flags held in the low bits of a struct page pointer on the mlock_pvec.
 */
#define LRU_PAGE 0x1
#define NEW_PAGE 0x2
static inline struct page *mlock_lru(struct page *page)
{
	return (struct page *)((unsigned long)page + LRU_PAGE);
}

static inline struct page *mlock_new(struct page *page)
{
	return (struct page *)((unsigned long)page + NEW_PAGE);
}

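/*
 * Illustrative sketch (not part of the original mlock.c): LRU_PAGE and
 * NEW_PAGE ride in the low bits of a struct page pointer, which is safe
 * because struct page is at least word-aligned; mlock_pagevec() below masks
 * them off again.  A self-contained userspace round trip of the same
 * pointer-tagging trick, with hypothetical names:
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *
 *	#define TAG_LRU	0x1UL
 *	#define TAG_NEW	0x2UL
 *
 *	struct obj { long payload; };	// aligned >= 4, so low bits are free
 *
 *	static struct obj *tag(struct obj *p, unsigned long t)
 *	{
 *		return (struct obj *)((uintptr_t)p + t);
 *	}
 *
 *	int main(void)
 *	{
 *		struct obj o = { 42 };
 *		struct obj *tagged = tag(&o, TAG_NEW);
 *		unsigned long t = (uintptr_t)tagged & (TAG_LRU | TAG_NEW);
 *		struct obj *p = (struct obj *)((uintptr_t)tagged - t);
 *
 *		assert(t == TAG_NEW && p == &o && p->payload == 42);
 *		return 0;
 *	}
 */
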
/*
 * mlock_pagevec() is derived from pagevec_lru_move_fn():
 * perhaps that can make use of such page pointer flags in future,
 * but for now just keep it for mlock.  We could use three separate
 * pagevecs instead, but one feels better (munlocking a full pagevec
 * does not need to drain mlocking pagevecs first).
 */
static void mlock_pagevec(struct pagevec *pvec)
{
	struct lruvec *lruvec = NULL;
	unsigned long mlock;
	struct page *page;
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		page = pvec->pages[i];
		mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
		page = (struct page *)((unsigned long)page - mlock);
		pvec->pages[i] = page;

		if (mlock & LRU_PAGE)
			lruvec = __mlock_page(page, lruvec);
		else if (mlock & NEW_PAGE)
			lruvec = __mlock_new_page(page, lruvec);
		else
			lruvec = __munlock_page(page, lruvec);
	}

	if (lruvec)
		unlock_page_lruvec_irq(lruvec);
	release_pages(pvec->pages, pvec->nr);
	pagevec_reinit(pvec);
}

void mlock_page_drain_local(void)
{
	struct pagevec *pvec;

	local_lock(&mlock_pvec.lock);
	pvec = this_cpu_ptr(&mlock_pvec.vec);
	if (pagevec_count(pvec))
		mlock_pagevec(pvec);
	local_unlock(&mlock_pvec.lock);
}

void mlock_page_drain_remote(int cpu)
{
	struct pagevec *pvec;

	WARN_ON_ONCE(cpu_online(cpu));
	pvec = &per_cpu(mlock_pvec.vec, cpu);
	if (pagevec_count(pvec))
		mlock_pagevec(pvec);
}

bool need_mlock_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(mlock_pvec.vec, cpu));
}

/**
 * mlock_folio - mlock a folio already on (or temporarily off) LRU
 * @folio: folio to be mlocked.
 */
void mlock_folio(struct folio *folio)
{
	struct pagevec *pvec;

	local_lock(&mlock_pvec.lock);
	pvec = this_cpu_ptr(&mlock_pvec.vec);

	if (!folio_test_set_mlocked(folio)) {
		int nr_pages = folio_nr_pages(folio);

		zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
		__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
	}

	folio_get(folio);
	if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
	    folio_test_large(folio) || lru_cache_disabled())
		mlock_pagevec(pvec);
	local_unlock(&mlock_pvec.lock);
}

/**
 * mlock_new_page - mlock a newly allocated page not yet on LRU
 * @page: page to be mlocked, either a normal page or a THP head.
 */
void mlock_new_page(struct page *page)
{
	struct pagevec *pvec;
	int nr_pages = thp_nr_pages(page);

	local_lock(&mlock_pvec.lock);
	pvec = this_cpu_ptr(&mlock_pvec.vec);
	SetPageMlocked(page);
	mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
	__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);

	get_page(page);
	if (!pagevec_add(pvec, mlock_new(page)) ||
	    PageHead(page) || lru_cache_disabled())
		mlock_pagevec(pvec);
	local_unlock(&mlock_pvec.lock);
}

/**
 * munlock_page - munlock a page
 * @page: page to be munlocked, either a normal page or a THP head.
 */
void munlock_page(struct page *page)
{
	struct pagevec *pvec;

	local_lock(&mlock_pvec.lock);
	pvec = this_cpu_ptr(&mlock_pvec.vec);
	/*
	 * TestClearPageMlocked(page) must be left to __munlock_page(),
	 * which will check whether the page is multiply mlocked.
	 */

	get_page(page);
	if (!pagevec_add(pvec, page) ||
	    PageHead(page) || lru_cache_disabled())
		mlock_pagevec(pvec);
	local_unlock(&mlock_pvec.lock);
}

static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)

{
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte;
	struct page *page;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (!pmd_present(*pmd))
			goto out;
		if (is_huge_zero_pmd(*pmd))
			goto out;
		page = pmd_page(*pmd);
		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(page_folio(page));
		else
			munlock_page(page);
		goto out;
	}

	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		if (PageTransCompound(page))
			continue;
		if (vma->vm_flags & VM_LOCKED)
			mlock_folio(page_folio(page));
		else
			munlock_page(page);
	}
	pte_unmap(start_pte);
out:
	spin_unlock(ptl);
	cond_resched();
	return 0;
}

/*
 * mlock_vma_pages_range() - mlock any pages already in the range,
 *                           or munlock all pages in the range.
 * @vma - vma containing range to be mlock()ed or munlock()ed
 * @start - start address in @vma of the range
 * @end - end of range in @vma
 * @newflags - the new set of flags for @vma.
 *
 * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
 * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
 */
static void mlock_vma_pages_range(struct vm_area_struct *vma,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	static const struct mm_walk_ops mlock_walk_ops = {
		.pmd_entry = mlock_pte_range,
	};

	/*
	 * There is a slight chance that concurrent page migration,
	 * or page reclaim finding a page of this now-VM_LOCKED vma,
	 * will call mlock_vma_page() and raise page's mlock_count:
	 * double counting, leaving the page unevictable indefinitely.
	 * Communicate this danger to mlock_vma_page() with VM_IO,
	 * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
	 * mmap_lock is held in write mode here, so this weird
	 * combination should not be visible to other mmap_lock users;
	 * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
	 */
	if (newflags & VM_LOCKED)
		newflags |= VM_IO;
	WRITE_ONCE(vma->vm_flags, newflags);

	lru_add_drain();
	walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
	lru_add_drain();

	if (newflags & VM_IO) {
		newflags &= ~VM_IO;
		WRITE_ONCE(vma->vm_flags, newflags);
	}
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_vma_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!(newflags & VM_LOCKED))
		nr_pages = -nr_pages;
	else if (oldflags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */

	if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
		/* No work to do, and mlocking twice would be wrong */
		vma->vm_flags = newflags;
	} else {
		mlock_vma_pages_range(vma, start, end, newflags);
	}
out:
	*prev = vma;
	return ret;
}

static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

/*
 * Go through vma areas and sum size of mlocked
 * vma pages, as return value.
 * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
 * is also counted.
 * Return value: previously mlocked page counts
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		return 0;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <=  vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}

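/*
 * Worked example (not part of the original mlock.c), assuming 4 KiB pages:
 * suppose a single VM_LOCKED vma spans [0x10000, 0x20000) and the caller
 * passes start = 0x18000, len = 0x10000, i.e. [0x18000, 0x28000).  The loop
 * above first subtracts start - vm_start = 0x8000, then, since
 * start + len >= vm_end, adds the full vma size 0x10000, leaving
 * count = 0x8000 bytes; the final shift yields 8 pages, exactly the overlap
 * [0x18000, 0x20000) that is already charged to mm->locked_vm.
 */
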
/*
 * convert get_user_pages() return value to posix mlock() error
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * It is possible that the regions requested intersect with
		 * previously mlocked areas, that part area in "mm->locked_vm"
		 * should not be counted to new mlock increment count. So check
		 * and adjust locked count if necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}

SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}

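/*
 * Illustrative sketch (not part of the original mlock.c): exercising the
 * syscalls defined above from userspace.  MLOCK_ONFAULT asks for
 * VM_LOCKONFAULT, so pages are locked as they are faulted in instead of
 * being populated up front by do_mlock().  Assuming a libc that exposes
 * mlock2() and MLOCK_ONFAULT (glibc 2.27+ with _GNU_SOURCE); otherwise the
 * raw syscall number would be needed:
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 1 << 20;
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p == MAP_FAILED)
 *			return 1;
 *		if (mlock2(p, len, MLOCK_ONFAULT))	// lock on first touch
 *			perror("mlock2");	// e.g. EPERM or ENOMEM vs. RLIMIT_MEMLOCK
 *		munlock(p, len);
 *		munmap(p, len);
 *		return 0;
 *	}
 */
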
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}

SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}

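/*
 * Illustrative sketch (not part of the original mlock.c): mlockall() accepts
 * MCL_CURRENT and/or MCL_FUTURE, optionally combined with MCL_ONFAULT, but
 * MCL_ONFAULT on its own is rejected with -EINVAL by the check above.  A
 * minimal userspace caller:
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (mlockall(MCL_CURRENT | MCL_FUTURE))
 *			perror("mlockall");	// EPERM or ENOMEM vs. RLIMIT_MEMLOCK
 *		// ... latency-sensitive work with no major faults ...
 *		munlockall();
 *		return 0;
 *	}
 */
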
/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit != RLIM_INFINITY)
		lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		allowed = 0;
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}
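
/*
 * Illustrative sketch (not part of the original mlock.c): user_shm_lock() and
 * user_shm_unlock() back the SHM_LOCK/SHM_UNLOCK shmctl() commands, charging
 * the locked size against the owning user's UCOUNT_RLIMIT_MEMLOCK rather than
 * a single mm.  From userspace that path looks roughly like:
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int id = shmget(IPC_PRIVATE, 1 << 20, IPC_CREAT | 0600);
 *
 *		if (id < 0)
 *			return 1;
 *		if (shmctl(id, SHM_LOCK, NULL))		// ends up in user_shm_lock()
 *			perror("shmctl(SHM_LOCK)");
 *		shmctl(id, SHM_UNLOCK, NULL);
 *		shmctl(id, IPC_RMID, NULL);
 *		return 0;
 *	}
 */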