cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

task_mmu.c (49378B)


      1// SPDX-License-Identifier: GPL-2.0
      2#include <linux/pagewalk.h>
      3#include <linux/vmacache.h>
      4#include <linux/mm_inline.h>
      5#include <linux/hugetlb.h>
      6#include <linux/huge_mm.h>
      7#include <linux/mount.h>
      8#include <linux/seq_file.h>
      9#include <linux/highmem.h>
     10#include <linux/ptrace.h>
     11#include <linux/slab.h>
     12#include <linux/pagemap.h>
     13#include <linux/mempolicy.h>
     14#include <linux/rmap.h>
     15#include <linux/swap.h>
     16#include <linux/sched/mm.h>
     17#include <linux/swapops.h>
     18#include <linux/mmu_notifier.h>
     19#include <linux/page_idle.h>
     20#include <linux/shmem_fs.h>
     21#include <linux/uaccess.h>
     22#include <linux/pkeys.h>
     23
     24#include <asm/elf.h>
     25#include <asm/tlb.h>
     26#include <asm/tlbflush.h>
     27#include "internal.h"
     28
     29#define SEQ_PUT_DEC(str, val) \
     30		seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
     31void task_mem(struct seq_file *m, struct mm_struct *mm)
     32{
     33	unsigned long text, lib, swap, anon, file, shmem;
     34	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
     35
     36	anon = get_mm_counter(mm, MM_ANONPAGES);
     37	file = get_mm_counter(mm, MM_FILEPAGES);
     38	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
     39
     40	/*
     41	 * Note: to minimize their overhead, mm maintains hiwater_vm and
     42	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
     43	 * collector of these hiwater stats must therefore get total_vm
     44	 * and rss too, which will usually be the higher.  Barriers? not
     45	 * worth the effort, such snapshots can always be inconsistent.
     46	 */
     47	hiwater_vm = total_vm = mm->total_vm;
     48	if (hiwater_vm < mm->hiwater_vm)
     49		hiwater_vm = mm->hiwater_vm;
     50	hiwater_rss = total_rss = anon + file + shmem;
     51	if (hiwater_rss < mm->hiwater_rss)
     52		hiwater_rss = mm->hiwater_rss;
     53
     54	/* split executable areas between text and lib */
     55	text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
     56	text = min(text, mm->exec_vm << PAGE_SHIFT);
     57	lib = (mm->exec_vm << PAGE_SHIFT) - text;
     58
     59	swap = get_mm_counter(mm, MM_SWAPENTS);
     60	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
     61	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
     62	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
     63	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
     64	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
     65	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
     66	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
     67	SEQ_PUT_DEC(" kB\nRssFile:\t", file);
     68	SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
     69	SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
     70	SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
     71	seq_put_decimal_ull_width(m,
     72		    " kB\nVmExe:\t", text >> 10, 8);
     73	seq_put_decimal_ull_width(m,
     74		    " kB\nVmLib:\t", lib >> 10, 8);
     75	seq_put_decimal_ull_width(m,
     76		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
     77	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
     78	seq_puts(m, " kB\n");
     79	hugetlb_report_usage(m, mm);
     80}
     81#undef SEQ_PUT_DEC
     82
     83unsigned long task_vsize(struct mm_struct *mm)
     84{
     85	return PAGE_SIZE * mm->total_vm;
     86}
     87
     88unsigned long task_statm(struct mm_struct *mm,
     89			 unsigned long *shared, unsigned long *text,
     90			 unsigned long *data, unsigned long *resident)
     91{
     92	*shared = get_mm_counter(mm, MM_FILEPAGES) +
     93			get_mm_counter(mm, MM_SHMEMPAGES);
     94	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
     95								>> PAGE_SHIFT;
     96	*data = mm->data_vm + mm->stack_vm;
     97	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
     98	return mm->total_vm;
     99}
    100
    101#ifdef CONFIG_NUMA
    102/*
    103 * Save get_task_policy() for show_numa_map().
    104 */
    105static void hold_task_mempolicy(struct proc_maps_private *priv)
    106{
    107	struct task_struct *task = priv->task;
    108
    109	task_lock(task);
    110	priv->task_mempolicy = get_task_policy(task);
    111	mpol_get(priv->task_mempolicy);
    112	task_unlock(task);
    113}
    114static void release_task_mempolicy(struct proc_maps_private *priv)
    115{
    116	mpol_put(priv->task_mempolicy);
    117}
    118#else
    119static void hold_task_mempolicy(struct proc_maps_private *priv)
    120{
    121}
    122static void release_task_mempolicy(struct proc_maps_private *priv)
    123{
    124}
    125#endif
    126
    127static void *m_start(struct seq_file *m, loff_t *ppos)
    128{
    129	struct proc_maps_private *priv = m->private;
    130	unsigned long last_addr = *ppos;
    131	struct mm_struct *mm;
    132	struct vm_area_struct *vma;
    133
    134	/* See m_next(). Zero at the start or after lseek. */
    135	if (last_addr == -1UL)
    136		return NULL;
    137
    138	priv->task = get_proc_task(priv->inode);
    139	if (!priv->task)
    140		return ERR_PTR(-ESRCH);
    141
    142	mm = priv->mm;
    143	if (!mm || !mmget_not_zero(mm)) {
    144		put_task_struct(priv->task);
    145		priv->task = NULL;
    146		return NULL;
    147	}
    148
    149	if (mmap_read_lock_killable(mm)) {
    150		mmput(mm);
    151		put_task_struct(priv->task);
    152		priv->task = NULL;
    153		return ERR_PTR(-EINTR);
    154	}
    155
    156	hold_task_mempolicy(priv);
    157	priv->tail_vma = get_gate_vma(mm);
    158
    159	vma = find_vma(mm, last_addr);
    160	if (vma)
    161		return vma;
    162
    163	return priv->tail_vma;
    164}
    165
    166static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
    167{
    168	struct proc_maps_private *priv = m->private;
    169	struct vm_area_struct *next, *vma = v;
    170
    171	if (vma == priv->tail_vma)
    172		next = NULL;
    173	else if (vma->vm_next)
    174		next = vma->vm_next;
    175	else
    176		next = priv->tail_vma;
    177
    178	*ppos = next ? next->vm_start : -1UL;
    179
    180	return next;
    181}
    182
    183static void m_stop(struct seq_file *m, void *v)
    184{
    185	struct proc_maps_private *priv = m->private;
    186	struct mm_struct *mm = priv->mm;
    187
    188	if (!priv->task)
    189		return;
    190
    191	release_task_mempolicy(priv);
    192	mmap_read_unlock(mm);
    193	mmput(mm);
    194	put_task_struct(priv->task);
    195	priv->task = NULL;
    196}
    197
    198static int proc_maps_open(struct inode *inode, struct file *file,
    199			const struct seq_operations *ops, int psize)
    200{
    201	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
    202
    203	if (!priv)
    204		return -ENOMEM;
    205
    206	priv->inode = inode;
    207	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
    208	if (IS_ERR(priv->mm)) {
    209		int err = PTR_ERR(priv->mm);
    210
    211		seq_release_private(inode, file);
    212		return err;
    213	}
    214
    215	return 0;
    216}
    217
    218static int proc_map_release(struct inode *inode, struct file *file)
    219{
    220	struct seq_file *seq = file->private_data;
    221	struct proc_maps_private *priv = seq->private;
    222
    223	if (priv->mm)
    224		mmdrop(priv->mm);
    225
    226	return seq_release_private(inode, file);
    227}
    228
    229static int do_maps_open(struct inode *inode, struct file *file,
    230			const struct seq_operations *ops)
    231{
    232	return proc_maps_open(inode, file, ops,
    233				sizeof(struct proc_maps_private));
    234}
    235
    236/*
    237 * Indicate if the VMA is a stack for the given task; for
    238 * /proc/PID/maps that is the stack of the main task.
    239 */
    240static int is_stack(struct vm_area_struct *vma)
    241{
    242	/*
    243	 * We make no effort to guess what a given thread considers to be
    244	 * its "stack".  It's not even well-defined for programs written
    245	 * languages like Go.
    246	 */
    247	return vma->vm_start <= vma->vm_mm->start_stack &&
    248		vma->vm_end >= vma->vm_mm->start_stack;
    249}
    250
    251static void show_vma_header_prefix(struct seq_file *m,
    252				   unsigned long start, unsigned long end,
    253				   vm_flags_t flags, unsigned long long pgoff,
    254				   dev_t dev, unsigned long ino)
    255{
    256	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
    257	seq_put_hex_ll(m, NULL, start, 8);
    258	seq_put_hex_ll(m, "-", end, 8);
    259	seq_putc(m, ' ');
    260	seq_putc(m, flags & VM_READ ? 'r' : '-');
    261	seq_putc(m, flags & VM_WRITE ? 'w' : '-');
    262	seq_putc(m, flags & VM_EXEC ? 'x' : '-');
    263	seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
    264	seq_put_hex_ll(m, " ", pgoff, 8);
    265	seq_put_hex_ll(m, " ", MAJOR(dev), 2);
    266	seq_put_hex_ll(m, ":", MINOR(dev), 2);
    267	seq_put_decimal_ull(m, " ", ino);
    268	seq_putc(m, ' ');
    269}
    270
    271static void
    272show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
    273{
    274	struct mm_struct *mm = vma->vm_mm;
    275	struct file *file = vma->vm_file;
    276	vm_flags_t flags = vma->vm_flags;
    277	unsigned long ino = 0;
    278	unsigned long long pgoff = 0;
    279	unsigned long start, end;
    280	dev_t dev = 0;
    281	const char *name = NULL;
    282
    283	if (file) {
    284		struct inode *inode = file_inode(vma->vm_file);
    285		dev = inode->i_sb->s_dev;
    286		ino = inode->i_ino;
    287		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
    288	}
    289
    290	start = vma->vm_start;
    291	end = vma->vm_end;
    292	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
    293
    294	/*
    295	 * Print the dentry name for named mappings, and a
    296	 * special [heap] marker for the heap:
    297	 */
    298	if (file) {
    299		seq_pad(m, ' ');
    300		seq_file_path(m, file, "\n");
    301		goto done;
    302	}
    303
    304	if (vma->vm_ops && vma->vm_ops->name) {
    305		name = vma->vm_ops->name(vma);
    306		if (name)
    307			goto done;
    308	}
    309
    310	name = arch_vma_name(vma);
    311	if (!name) {
    312		struct anon_vma_name *anon_name;
    313
    314		if (!mm) {
    315			name = "[vdso]";
    316			goto done;
    317		}
    318
    319		if (vma->vm_start <= mm->brk &&
    320		    vma->vm_end >= mm->start_brk) {
    321			name = "[heap]";
    322			goto done;
    323		}
    324
    325		if (is_stack(vma)) {
    326			name = "[stack]";
    327			goto done;
    328		}
    329
    330		anon_name = anon_vma_name(vma);
    331		if (anon_name) {
    332			seq_pad(m, ' ');
    333			seq_printf(m, "[anon:%s]", anon_name->name);
    334		}
    335	}
    336
    337done:
    338	if (name) {
    339		seq_pad(m, ' ');
    340		seq_puts(m, name);
    341	}
    342	seq_putc(m, '\n');
    343}
    344
    345static int show_map(struct seq_file *m, void *v)
    346{
    347	show_map_vma(m, v);
    348	return 0;
    349}
    350
    351static const struct seq_operations proc_pid_maps_op = {
    352	.start	= m_start,
    353	.next	= m_next,
    354	.stop	= m_stop,
    355	.show	= show_map
    356};
    357
    358static int pid_maps_open(struct inode *inode, struct file *file)
    359{
    360	return do_maps_open(inode, file, &proc_pid_maps_op);
    361}
    362
    363const struct file_operations proc_pid_maps_operations = {
    364	.open		= pid_maps_open,
    365	.read		= seq_read,
    366	.llseek		= seq_lseek,
    367	.release	= proc_map_release,
    368};
    369
    370/*
    371 * Proportional Set Size(PSS): my share of RSS.
    372 *
    373 * PSS of a process is the count of pages it has in memory, where each
    374 * page is divided by the number of processes sharing it.  So if a
    375 * process has 1000 pages all to itself, and 1000 shared with one other
    376 * process, its PSS will be 1500.
    377 *
    378 * To keep (accumulated) division errors low, we adopt a 64bit
    379 * fixed-point pss counter to minimize division errors. So (pss >>
    380 * PSS_SHIFT) would be the real byte count.
    381 *
    382 * A shift of 12 before division means (assuming 4K page size):
    383 * 	- 1M 3-user-pages add up to 8KB errors;
    384 * 	- supports mapcount up to 2^24, or 16M;
    385 * 	- supports PSS up to 2^52 bytes, or 4PB.
    386 */
    387#define PSS_SHIFT 12
    388
    389#ifdef CONFIG_PROC_PAGE_MONITOR
    390struct mem_size_stats {
    391	unsigned long resident;
    392	unsigned long shared_clean;
    393	unsigned long shared_dirty;
    394	unsigned long private_clean;
    395	unsigned long private_dirty;
    396	unsigned long referenced;
    397	unsigned long anonymous;
    398	unsigned long lazyfree;
    399	unsigned long anonymous_thp;
    400	unsigned long shmem_thp;
    401	unsigned long file_thp;
    402	unsigned long swap;
    403	unsigned long shared_hugetlb;
    404	unsigned long private_hugetlb;
    405	u64 pss;
    406	u64 pss_anon;
    407	u64 pss_file;
    408	u64 pss_shmem;
    409	u64 pss_locked;
    410	u64 swap_pss;
    411};
    412
    413static void smaps_page_accumulate(struct mem_size_stats *mss,
    414		struct page *page, unsigned long size, unsigned long pss,
    415		bool dirty, bool locked, bool private)
    416{
    417	mss->pss += pss;
    418
    419	if (PageAnon(page))
    420		mss->pss_anon += pss;
    421	else if (PageSwapBacked(page))
    422		mss->pss_shmem += pss;
    423	else
    424		mss->pss_file += pss;
    425
    426	if (locked)
    427		mss->pss_locked += pss;
    428
    429	if (dirty || PageDirty(page)) {
    430		if (private)
    431			mss->private_dirty += size;
    432		else
    433			mss->shared_dirty += size;
    434	} else {
    435		if (private)
    436			mss->private_clean += size;
    437		else
    438			mss->shared_clean += size;
    439	}
    440}
    441
    442static void smaps_account(struct mem_size_stats *mss, struct page *page,
    443		bool compound, bool young, bool dirty, bool locked,
    444		bool migration)
    445{
    446	int i, nr = compound ? compound_nr(page) : 1;
    447	unsigned long size = nr * PAGE_SIZE;
    448
    449	/*
    450	 * First accumulate quantities that depend only on |size| and the type
    451	 * of the compound page.
    452	 */
    453	if (PageAnon(page)) {
    454		mss->anonymous += size;
    455		if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
    456			mss->lazyfree += size;
    457	}
    458
    459	mss->resident += size;
    460	/* Accumulate the size in pages that have been accessed. */
    461	if (young || page_is_young(page) || PageReferenced(page))
    462		mss->referenced += size;
    463
    464	/*
    465	 * Then accumulate quantities that may depend on sharing, or that may
    466	 * differ page-by-page.
    467	 *
    468	 * page_count(page) == 1 guarantees the page is mapped exactly once.
    469	 * If any subpage of the compound page mapped with PTE it would elevate
    470	 * page_count().
    471	 *
    472	 * The page_mapcount() is called to get a snapshot of the mapcount.
    473	 * Without holding the page lock this snapshot can be slightly wrong as
    474	 * we cannot always read the mapcount atomically.  It is not safe to
    475	 * call page_mapcount() even with PTL held if the page is not mapped,
    476	 * especially for migration entries.  Treat regular migration entries
    477	 * as mapcount == 1.
    478	 */
    479	if ((page_count(page) == 1) || migration) {
    480		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
    481			locked, true);
    482		return;
    483	}
    484	for (i = 0; i < nr; i++, page++) {
    485		int mapcount = page_mapcount(page);
    486		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
    487		if (mapcount >= 2)
    488			pss /= mapcount;
    489		smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
    490				      mapcount < 2);
    491	}
    492}
    493
    494#ifdef CONFIG_SHMEM
    495static int smaps_pte_hole(unsigned long addr, unsigned long end,
    496			  __always_unused int depth, struct mm_walk *walk)
    497{
    498	struct mem_size_stats *mss = walk->private;
    499	struct vm_area_struct *vma = walk->vma;
    500
    501	mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
    502					      linear_page_index(vma, addr),
    503					      linear_page_index(vma, end));
    504
    505	return 0;
    506}
    507#else
    508#define smaps_pte_hole		NULL
    509#endif /* CONFIG_SHMEM */
    510
    511static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
    512{
    513#ifdef CONFIG_SHMEM
    514	if (walk->ops->pte_hole) {
    515		/* depth is not used */
    516		smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
    517	}
    518#endif
    519}
    520
    521static void smaps_pte_entry(pte_t *pte, unsigned long addr,
    522		struct mm_walk *walk)
    523{
    524	struct mem_size_stats *mss = walk->private;
    525	struct vm_area_struct *vma = walk->vma;
    526	bool locked = !!(vma->vm_flags & VM_LOCKED);
    527	struct page *page = NULL;
    528	bool migration = false;
    529
    530	if (pte_present(*pte)) {
    531		page = vm_normal_page(vma, addr, *pte);
    532	} else if (is_swap_pte(*pte)) {
    533		swp_entry_t swpent = pte_to_swp_entry(*pte);
    534
    535		if (!non_swap_entry(swpent)) {
    536			int mapcount;
    537
    538			mss->swap += PAGE_SIZE;
    539			mapcount = swp_swapcount(swpent);
    540			if (mapcount >= 2) {
    541				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
    542
    543				do_div(pss_delta, mapcount);
    544				mss->swap_pss += pss_delta;
    545			} else {
    546				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
    547			}
    548		} else if (is_pfn_swap_entry(swpent)) {
    549			if (is_migration_entry(swpent))
    550				migration = true;
    551			page = pfn_swap_entry_to_page(swpent);
    552		}
    553	} else {
    554		smaps_pte_hole_lookup(addr, walk);
    555		return;
    556	}
    557
    558	if (!page)
    559		return;
    560
    561	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
    562		      locked, migration);
    563}
    564
    565#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    566static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
    567		struct mm_walk *walk)
    568{
    569	struct mem_size_stats *mss = walk->private;
    570	struct vm_area_struct *vma = walk->vma;
    571	bool locked = !!(vma->vm_flags & VM_LOCKED);
    572	struct page *page = NULL;
    573	bool migration = false;
    574
    575	if (pmd_present(*pmd)) {
    576		/* FOLL_DUMP will return -EFAULT on huge zero page */
    577		page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
    578	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
    579		swp_entry_t entry = pmd_to_swp_entry(*pmd);
    580
    581		if (is_migration_entry(entry)) {
    582			migration = true;
    583			page = pfn_swap_entry_to_page(entry);
    584		}
    585	}
    586	if (IS_ERR_OR_NULL(page))
    587		return;
    588	if (PageAnon(page))
    589		mss->anonymous_thp += HPAGE_PMD_SIZE;
    590	else if (PageSwapBacked(page))
    591		mss->shmem_thp += HPAGE_PMD_SIZE;
    592	else if (is_zone_device_page(page))
    593		/* pass */;
    594	else
    595		mss->file_thp += HPAGE_PMD_SIZE;
    596
    597	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
    598		      locked, migration);
    599}
    600#else
    601static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
    602		struct mm_walk *walk)
    603{
    604}
    605#endif
    606
    607static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
    608			   struct mm_walk *walk)
    609{
    610	struct vm_area_struct *vma = walk->vma;
    611	pte_t *pte;
    612	spinlock_t *ptl;
    613
    614	ptl = pmd_trans_huge_lock(pmd, vma);
    615	if (ptl) {
    616		smaps_pmd_entry(pmd, addr, walk);
    617		spin_unlock(ptl);
    618		goto out;
    619	}
    620
    621	if (pmd_trans_unstable(pmd))
    622		goto out;
    623	/*
    624	 * The mmap_lock held all the way back in m_start() is what
    625	 * keeps khugepaged out of here and from collapsing things
    626	 * in here.
    627	 */
    628	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
    629	for (; addr != end; pte++, addr += PAGE_SIZE)
    630		smaps_pte_entry(pte, addr, walk);
    631	pte_unmap_unlock(pte - 1, ptl);
    632out:
    633	cond_resched();
    634	return 0;
    635}
    636
    637static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
    638{
    639	/*
    640	 * Don't forget to update Documentation/ on changes.
    641	 */
    642	static const char mnemonics[BITS_PER_LONG][2] = {
    643		/*
    644		 * In case if we meet a flag we don't know about.
    645		 */
    646		[0 ... (BITS_PER_LONG-1)] = "??",
    647
    648		[ilog2(VM_READ)]	= "rd",
    649		[ilog2(VM_WRITE)]	= "wr",
    650		[ilog2(VM_EXEC)]	= "ex",
    651		[ilog2(VM_SHARED)]	= "sh",
    652		[ilog2(VM_MAYREAD)]	= "mr",
    653		[ilog2(VM_MAYWRITE)]	= "mw",
    654		[ilog2(VM_MAYEXEC)]	= "me",
    655		[ilog2(VM_MAYSHARE)]	= "ms",
    656		[ilog2(VM_GROWSDOWN)]	= "gd",
    657		[ilog2(VM_PFNMAP)]	= "pf",
    658		[ilog2(VM_LOCKED)]	= "lo",
    659		[ilog2(VM_IO)]		= "io",
    660		[ilog2(VM_SEQ_READ)]	= "sr",
    661		[ilog2(VM_RAND_READ)]	= "rr",
    662		[ilog2(VM_DONTCOPY)]	= "dc",
    663		[ilog2(VM_DONTEXPAND)]	= "de",
    664		[ilog2(VM_ACCOUNT)]	= "ac",
    665		[ilog2(VM_NORESERVE)]	= "nr",
    666		[ilog2(VM_HUGETLB)]	= "ht",
    667		[ilog2(VM_SYNC)]	= "sf",
    668		[ilog2(VM_ARCH_1)]	= "ar",
    669		[ilog2(VM_WIPEONFORK)]	= "wf",
    670		[ilog2(VM_DONTDUMP)]	= "dd",
    671#ifdef CONFIG_ARM64_BTI
    672		[ilog2(VM_ARM64_BTI)]	= "bt",
    673#endif
    674#ifdef CONFIG_MEM_SOFT_DIRTY
    675		[ilog2(VM_SOFTDIRTY)]	= "sd",
    676#endif
    677		[ilog2(VM_MIXEDMAP)]	= "mm",
    678		[ilog2(VM_HUGEPAGE)]	= "hg",
    679		[ilog2(VM_NOHUGEPAGE)]	= "nh",
    680		[ilog2(VM_MERGEABLE)]	= "mg",
    681		[ilog2(VM_UFFD_MISSING)]= "um",
    682		[ilog2(VM_UFFD_WP)]	= "uw",
    683#ifdef CONFIG_ARM64_MTE
    684		[ilog2(VM_MTE)]		= "mt",
    685		[ilog2(VM_MTE_ALLOWED)]	= "",
    686#endif
    687#ifdef CONFIG_ARCH_HAS_PKEYS
    688		/* These come out via ProtectionKey: */
    689		[ilog2(VM_PKEY_BIT0)]	= "",
    690		[ilog2(VM_PKEY_BIT1)]	= "",
    691		[ilog2(VM_PKEY_BIT2)]	= "",
    692		[ilog2(VM_PKEY_BIT3)]	= "",
    693#if VM_PKEY_BIT4
    694		[ilog2(VM_PKEY_BIT4)]	= "",
    695#endif
    696#endif /* CONFIG_ARCH_HAS_PKEYS */
    697#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
    698		[ilog2(VM_UFFD_MINOR)]	= "ui",
    699#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
    700	};
    701	size_t i;
    702
    703	seq_puts(m, "VmFlags: ");
    704	for (i = 0; i < BITS_PER_LONG; i++) {
    705		if (!mnemonics[i][0])
    706			continue;
    707		if (vma->vm_flags & (1UL << i)) {
    708			seq_putc(m, mnemonics[i][0]);
    709			seq_putc(m, mnemonics[i][1]);
    710			seq_putc(m, ' ');
    711		}
    712	}
    713	seq_putc(m, '\n');
    714}
    715
    716#ifdef CONFIG_HUGETLB_PAGE
    717static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
    718				 unsigned long addr, unsigned long end,
    719				 struct mm_walk *walk)
    720{
    721	struct mem_size_stats *mss = walk->private;
    722	struct vm_area_struct *vma = walk->vma;
    723	struct page *page = NULL;
    724
    725	if (pte_present(*pte)) {
    726		page = vm_normal_page(vma, addr, *pte);
    727	} else if (is_swap_pte(*pte)) {
    728		swp_entry_t swpent = pte_to_swp_entry(*pte);
    729
    730		if (is_pfn_swap_entry(swpent))
    731			page = pfn_swap_entry_to_page(swpent);
    732	}
    733	if (page) {
    734		int mapcount = page_mapcount(page);
    735
    736		if (mapcount >= 2)
    737			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
    738		else
    739			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
    740	}
    741	return 0;
    742}
    743#else
    744#define smaps_hugetlb_range	NULL
    745#endif /* HUGETLB_PAGE */
    746
    747static const struct mm_walk_ops smaps_walk_ops = {
    748	.pmd_entry		= smaps_pte_range,
    749	.hugetlb_entry		= smaps_hugetlb_range,
    750};
    751
    752static const struct mm_walk_ops smaps_shmem_walk_ops = {
    753	.pmd_entry		= smaps_pte_range,
    754	.hugetlb_entry		= smaps_hugetlb_range,
    755	.pte_hole		= smaps_pte_hole,
    756};
    757
    758/*
    759 * Gather mem stats from @vma with the indicated beginning
    760 * address @start, and keep them in @mss.
    761 *
    762 * Use vm_start of @vma as the beginning address if @start is 0.
    763 */
    764static void smap_gather_stats(struct vm_area_struct *vma,
    765		struct mem_size_stats *mss, unsigned long start)
    766{
    767	const struct mm_walk_ops *ops = &smaps_walk_ops;
    768
    769	/* Invalid start */
    770	if (start >= vma->vm_end)
    771		return;
    772
    773#ifdef CONFIG_SHMEM
    774	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
    775		/*
    776		 * For shared or readonly shmem mappings we know that all
    777		 * swapped out pages belong to the shmem object, and we can
    778		 * obtain the swap value much more efficiently. For private
    779		 * writable mappings, we might have COW pages that are
    780		 * not affected by the parent swapped out pages of the shmem
    781		 * object, so we have to distinguish them during the page walk.
    782		 * Unless we know that the shmem object (or the part mapped by
    783		 * our VMA) has no swapped out pages at all.
    784		 */
    785		unsigned long shmem_swapped = shmem_swap_usage(vma);
    786
    787		if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
    788					!(vma->vm_flags & VM_WRITE))) {
    789			mss->swap += shmem_swapped;
    790		} else {
    791			ops = &smaps_shmem_walk_ops;
    792		}
    793	}
    794#endif
    795	/* mmap_lock is held in m_start */
    796	if (!start)
    797		walk_page_vma(vma, ops, mss);
    798	else
    799		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
    800}
    801
    802#define SEQ_PUT_DEC(str, val) \
    803		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
    804
    805/* Show the contents common for smaps and smaps_rollup */
    806static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
    807	bool rollup_mode)
    808{
    809	SEQ_PUT_DEC("Rss:            ", mss->resident);
    810	SEQ_PUT_DEC(" kB\nPss:            ", mss->pss >> PSS_SHIFT);
    811	if (rollup_mode) {
    812		/*
    813		 * These are meaningful only for smaps_rollup, otherwise two of
    814		 * them are zero, and the other one is the same as Pss.
    815		 */
    816		SEQ_PUT_DEC(" kB\nPss_Anon:       ",
    817			mss->pss_anon >> PSS_SHIFT);
    818		SEQ_PUT_DEC(" kB\nPss_File:       ",
    819			mss->pss_file >> PSS_SHIFT);
    820		SEQ_PUT_DEC(" kB\nPss_Shmem:      ",
    821			mss->pss_shmem >> PSS_SHIFT);
    822	}
    823	SEQ_PUT_DEC(" kB\nShared_Clean:   ", mss->shared_clean);
    824	SEQ_PUT_DEC(" kB\nShared_Dirty:   ", mss->shared_dirty);
    825	SEQ_PUT_DEC(" kB\nPrivate_Clean:  ", mss->private_clean);
    826	SEQ_PUT_DEC(" kB\nPrivate_Dirty:  ", mss->private_dirty);
    827	SEQ_PUT_DEC(" kB\nReferenced:     ", mss->referenced);
    828	SEQ_PUT_DEC(" kB\nAnonymous:      ", mss->anonymous);
    829	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
    830	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
    831	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
    832	SEQ_PUT_DEC(" kB\nFilePmdMapped:  ", mss->file_thp);
    833	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
    834	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
    835				  mss->private_hugetlb >> 10, 7);
    836	SEQ_PUT_DEC(" kB\nSwap:           ", mss->swap);
    837	SEQ_PUT_DEC(" kB\nSwapPss:        ",
    838					mss->swap_pss >> PSS_SHIFT);
    839	SEQ_PUT_DEC(" kB\nLocked:         ",
    840					mss->pss_locked >> PSS_SHIFT);
    841	seq_puts(m, " kB\n");
    842}
    843
    844static int show_smap(struct seq_file *m, void *v)
    845{
    846	struct vm_area_struct *vma = v;
    847	struct mem_size_stats mss;
    848
    849	memset(&mss, 0, sizeof(mss));
    850
    851	smap_gather_stats(vma, &mss, 0);
    852
    853	show_map_vma(m, vma);
    854
    855	SEQ_PUT_DEC("Size:           ", vma->vm_end - vma->vm_start);
    856	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
    857	SEQ_PUT_DEC(" kB\nMMUPageSize:    ", vma_mmu_pagesize(vma));
    858	seq_puts(m, " kB\n");
    859
    860	__show_smap(m, &mss, false);
    861
    862	seq_printf(m, "THPeligible:    %d\n",
    863		   transparent_hugepage_active(vma));
    864
    865	if (arch_pkeys_enabled())
    866		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
    867	show_smap_vma_flags(m, vma);
    868
    869	return 0;
    870}
    871
    872static int show_smaps_rollup(struct seq_file *m, void *v)
    873{
    874	struct proc_maps_private *priv = m->private;
    875	struct mem_size_stats mss;
    876	struct mm_struct *mm;
    877	struct vm_area_struct *vma;
    878	unsigned long last_vma_end = 0;
    879	int ret = 0;
    880
    881	priv->task = get_proc_task(priv->inode);
    882	if (!priv->task)
    883		return -ESRCH;
    884
    885	mm = priv->mm;
    886	if (!mm || !mmget_not_zero(mm)) {
    887		ret = -ESRCH;
    888		goto out_put_task;
    889	}
    890
    891	memset(&mss, 0, sizeof(mss));
    892
    893	ret = mmap_read_lock_killable(mm);
    894	if (ret)
    895		goto out_put_mm;
    896
    897	hold_task_mempolicy(priv);
    898
    899	for (vma = priv->mm->mmap; vma;) {
    900		smap_gather_stats(vma, &mss, 0);
    901		last_vma_end = vma->vm_end;
    902
    903		/*
    904		 * Release mmap_lock temporarily if someone wants to
    905		 * access it for write request.
    906		 */
    907		if (mmap_lock_is_contended(mm)) {
    908			mmap_read_unlock(mm);
    909			ret = mmap_read_lock_killable(mm);
    910			if (ret) {
    911				release_task_mempolicy(priv);
    912				goto out_put_mm;
    913			}
    914
    915			/*
    916			 * After dropping the lock, there are four cases to
    917			 * consider. See the following example for explanation.
    918			 *
    919			 *   +------+------+-----------+
    920			 *   | VMA1 | VMA2 | VMA3      |
    921			 *   +------+------+-----------+
    922			 *   |      |      |           |
    923			 *  4k     8k     16k         400k
    924			 *
    925			 * Suppose we drop the lock after reading VMA2 due to
    926			 * contention, then we get:
    927			 *
    928			 *	last_vma_end = 16k
    929			 *
    930			 * 1) VMA2 is freed, but VMA3 exists:
    931			 *
    932			 *    find_vma(mm, 16k - 1) will return VMA3.
    933			 *    In this case, just continue from VMA3.
    934			 *
    935			 * 2) VMA2 still exists:
    936			 *
    937			 *    find_vma(mm, 16k - 1) will return VMA2.
    938			 *    Iterate the loop like the original one.
    939			 *
    940			 * 3) No more VMAs can be found:
    941			 *
    942			 *    find_vma(mm, 16k - 1) will return NULL.
    943			 *    No more things to do, just break.
    944			 *
    945			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
    946			 *
    947			 *    find_vma(mm, 16k - 1) will return VMA' whose range
    948			 *    contains last_vma_end.
    949			 *    Iterate VMA' from last_vma_end.
    950			 */
    951			vma = find_vma(mm, last_vma_end - 1);
    952			/* Case 3 above */
    953			if (!vma)
    954				break;
    955
    956			/* Case 1 above */
    957			if (vma->vm_start >= last_vma_end)
    958				continue;
    959
    960			/* Case 4 above */
    961			if (vma->vm_end > last_vma_end)
    962				smap_gather_stats(vma, &mss, last_vma_end);
    963		}
    964		/* Case 2 above */
    965		vma = vma->vm_next;
    966	}
    967
    968	show_vma_header_prefix(m, priv->mm->mmap->vm_start,
    969			       last_vma_end, 0, 0, 0, 0);
    970	seq_pad(m, ' ');
    971	seq_puts(m, "[rollup]\n");
    972
    973	__show_smap(m, &mss, true);
    974
    975	release_task_mempolicy(priv);
    976	mmap_read_unlock(mm);
    977
    978out_put_mm:
    979	mmput(mm);
    980out_put_task:
    981	put_task_struct(priv->task);
    982	priv->task = NULL;
    983
    984	return ret;
    985}
    986#undef SEQ_PUT_DEC
    987
    988static const struct seq_operations proc_pid_smaps_op = {
    989	.start	= m_start,
    990	.next	= m_next,
    991	.stop	= m_stop,
    992	.show	= show_smap
    993};
    994
    995static int pid_smaps_open(struct inode *inode, struct file *file)
    996{
    997	return do_maps_open(inode, file, &proc_pid_smaps_op);
    998}
    999
   1000static int smaps_rollup_open(struct inode *inode, struct file *file)
   1001{
   1002	int ret;
   1003	struct proc_maps_private *priv;
   1004
   1005	priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
   1006	if (!priv)
   1007		return -ENOMEM;
   1008
   1009	ret = single_open(file, show_smaps_rollup, priv);
   1010	if (ret)
   1011		goto out_free;
   1012
   1013	priv->inode = inode;
   1014	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
   1015	if (IS_ERR(priv->mm)) {
   1016		ret = PTR_ERR(priv->mm);
   1017
   1018		single_release(inode, file);
   1019		goto out_free;
   1020	}
   1021
   1022	return 0;
   1023
   1024out_free:
   1025	kfree(priv);
   1026	return ret;
   1027}
   1028
   1029static int smaps_rollup_release(struct inode *inode, struct file *file)
   1030{
   1031	struct seq_file *seq = file->private_data;
   1032	struct proc_maps_private *priv = seq->private;
   1033
   1034	if (priv->mm)
   1035		mmdrop(priv->mm);
   1036
   1037	kfree(priv);
   1038	return single_release(inode, file);
   1039}
   1040
   1041const struct file_operations proc_pid_smaps_operations = {
   1042	.open		= pid_smaps_open,
   1043	.read		= seq_read,
   1044	.llseek		= seq_lseek,
   1045	.release	= proc_map_release,
   1046};
   1047
   1048const struct file_operations proc_pid_smaps_rollup_operations = {
   1049	.open		= smaps_rollup_open,
   1050	.read		= seq_read,
   1051	.llseek		= seq_lseek,
   1052	.release	= smaps_rollup_release,
   1053};
   1054
   1055enum clear_refs_types {
   1056	CLEAR_REFS_ALL = 1,
   1057	CLEAR_REFS_ANON,
   1058	CLEAR_REFS_MAPPED,
   1059	CLEAR_REFS_SOFT_DIRTY,
   1060	CLEAR_REFS_MM_HIWATER_RSS,
   1061	CLEAR_REFS_LAST,
   1062};
   1063
   1064struct clear_refs_private {
   1065	enum clear_refs_types type;
   1066};
   1067
   1068#ifdef CONFIG_MEM_SOFT_DIRTY
   1069
   1070static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
   1071{
   1072	struct page *page;
   1073
   1074	if (!pte_write(pte))
   1075		return false;
   1076	if (!is_cow_mapping(vma->vm_flags))
   1077		return false;
   1078	if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
   1079		return false;
   1080	page = vm_normal_page(vma, addr, pte);
   1081	if (!page)
   1082		return false;
   1083	return page_maybe_dma_pinned(page);
   1084}
   1085
   1086static inline void clear_soft_dirty(struct vm_area_struct *vma,
   1087		unsigned long addr, pte_t *pte)
   1088{
   1089	/*
   1090	 * The soft-dirty tracker uses #PF-s to catch writes
   1091	 * to pages, so write-protect the pte as well. See the
   1092	 * Documentation/admin-guide/mm/soft-dirty.rst for full description
   1093	 * of how soft-dirty works.
   1094	 */
   1095	pte_t ptent = *pte;
   1096
   1097	if (pte_present(ptent)) {
   1098		pte_t old_pte;
   1099
   1100		if (pte_is_pinned(vma, addr, ptent))
   1101			return;
   1102		old_pte = ptep_modify_prot_start(vma, addr, pte);
   1103		ptent = pte_wrprotect(old_pte);
   1104		ptent = pte_clear_soft_dirty(ptent);
   1105		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
   1106	} else if (is_swap_pte(ptent)) {
   1107		ptent = pte_swp_clear_soft_dirty(ptent);
   1108		set_pte_at(vma->vm_mm, addr, pte, ptent);
   1109	}
   1110}
   1111#else
   1112static inline void clear_soft_dirty(struct vm_area_struct *vma,
   1113		unsigned long addr, pte_t *pte)
   1114{
   1115}
   1116#endif
   1117
   1118#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
   1119static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
   1120		unsigned long addr, pmd_t *pmdp)
   1121{
   1122	pmd_t old, pmd = *pmdp;
   1123
   1124	if (pmd_present(pmd)) {
   1125		/* See comment in change_huge_pmd() */
   1126		old = pmdp_invalidate(vma, addr, pmdp);
   1127		if (pmd_dirty(old))
   1128			pmd = pmd_mkdirty(pmd);
   1129		if (pmd_young(old))
   1130			pmd = pmd_mkyoung(pmd);
   1131
   1132		pmd = pmd_wrprotect(pmd);
   1133		pmd = pmd_clear_soft_dirty(pmd);
   1134
   1135		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
   1136	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
   1137		pmd = pmd_swp_clear_soft_dirty(pmd);
   1138		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
   1139	}
   1140}
   1141#else
   1142static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
   1143		unsigned long addr, pmd_t *pmdp)
   1144{
   1145}
   1146#endif
   1147
   1148static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
   1149				unsigned long end, struct mm_walk *walk)
   1150{
   1151	struct clear_refs_private *cp = walk->private;
   1152	struct vm_area_struct *vma = walk->vma;
   1153	pte_t *pte, ptent;
   1154	spinlock_t *ptl;
   1155	struct page *page;
   1156
   1157	ptl = pmd_trans_huge_lock(pmd, vma);
   1158	if (ptl) {
   1159		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
   1160			clear_soft_dirty_pmd(vma, addr, pmd);
   1161			goto out;
   1162		}
   1163
   1164		if (!pmd_present(*pmd))
   1165			goto out;
   1166
   1167		page = pmd_page(*pmd);
   1168
   1169		/* Clear accessed and referenced bits. */
   1170		pmdp_test_and_clear_young(vma, addr, pmd);
   1171		test_and_clear_page_young(page);
   1172		ClearPageReferenced(page);
   1173out:
   1174		spin_unlock(ptl);
   1175		return 0;
   1176	}
   1177
   1178	if (pmd_trans_unstable(pmd))
   1179		return 0;
   1180
   1181	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
   1182	for (; addr != end; pte++, addr += PAGE_SIZE) {
   1183		ptent = *pte;
   1184
   1185		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
   1186			clear_soft_dirty(vma, addr, pte);
   1187			continue;
   1188		}
   1189
   1190		if (!pte_present(ptent))
   1191			continue;
   1192
   1193		page = vm_normal_page(vma, addr, ptent);
   1194		if (!page)
   1195			continue;
   1196
   1197		/* Clear accessed and referenced bits. */
   1198		ptep_test_and_clear_young(vma, addr, pte);
   1199		test_and_clear_page_young(page);
   1200		ClearPageReferenced(page);
   1201	}
   1202	pte_unmap_unlock(pte - 1, ptl);
   1203	cond_resched();
   1204	return 0;
   1205}
   1206
   1207static int clear_refs_test_walk(unsigned long start, unsigned long end,
   1208				struct mm_walk *walk)
   1209{
   1210	struct clear_refs_private *cp = walk->private;
   1211	struct vm_area_struct *vma = walk->vma;
   1212
   1213	if (vma->vm_flags & VM_PFNMAP)
   1214		return 1;
   1215
   1216	/*
   1217	 * Writing 1 to /proc/pid/clear_refs affects all pages.
   1218	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
   1219	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
   1220	 * Writing 4 to /proc/pid/clear_refs affects all pages.
   1221	 */
   1222	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
   1223		return 1;
   1224	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
   1225		return 1;
   1226	return 0;
   1227}
   1228
   1229static const struct mm_walk_ops clear_refs_walk_ops = {
   1230	.pmd_entry		= clear_refs_pte_range,
   1231	.test_walk		= clear_refs_test_walk,
   1232};
   1233
   1234static ssize_t clear_refs_write(struct file *file, const char __user *buf,
   1235				size_t count, loff_t *ppos)
   1236{
   1237	struct task_struct *task;
   1238	char buffer[PROC_NUMBUF];
   1239	struct mm_struct *mm;
   1240	struct vm_area_struct *vma;
   1241	enum clear_refs_types type;
   1242	int itype;
   1243	int rv;
   1244
   1245	memset(buffer, 0, sizeof(buffer));
   1246	if (count > sizeof(buffer) - 1)
   1247		count = sizeof(buffer) - 1;
   1248	if (copy_from_user(buffer, buf, count))
   1249		return -EFAULT;
   1250	rv = kstrtoint(strstrip(buffer), 10, &itype);
   1251	if (rv < 0)
   1252		return rv;
   1253	type = (enum clear_refs_types)itype;
   1254	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
   1255		return -EINVAL;
   1256
   1257	task = get_proc_task(file_inode(file));
   1258	if (!task)
   1259		return -ESRCH;
   1260	mm = get_task_mm(task);
   1261	if (mm) {
   1262		struct mmu_notifier_range range;
   1263		struct clear_refs_private cp = {
   1264			.type = type,
   1265		};
   1266
   1267		if (mmap_write_lock_killable(mm)) {
   1268			count = -EINTR;
   1269			goto out_mm;
   1270		}
   1271		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
   1272			/*
   1273			 * Writing 5 to /proc/pid/clear_refs resets the peak
   1274			 * resident set size to this mm's current rss value.
   1275			 */
   1276			reset_mm_hiwater_rss(mm);
   1277			goto out_unlock;
   1278		}
   1279
   1280		if (type == CLEAR_REFS_SOFT_DIRTY) {
   1281			for (vma = mm->mmap; vma; vma = vma->vm_next) {
   1282				if (!(vma->vm_flags & VM_SOFTDIRTY))
   1283					continue;
   1284				vma->vm_flags &= ~VM_SOFTDIRTY;
   1285				vma_set_page_prot(vma);
   1286			}
   1287
   1288			inc_tlb_flush_pending(mm);
   1289			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
   1290						0, NULL, mm, 0, -1UL);
   1291			mmu_notifier_invalidate_range_start(&range);
   1292		}
   1293		walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
   1294				&cp);
   1295		if (type == CLEAR_REFS_SOFT_DIRTY) {
   1296			mmu_notifier_invalidate_range_end(&range);
   1297			flush_tlb_mm(mm);
   1298			dec_tlb_flush_pending(mm);
   1299		}
   1300out_unlock:
   1301		mmap_write_unlock(mm);
   1302out_mm:
   1303		mmput(mm);
   1304	}
   1305	put_task_struct(task);
   1306
   1307	return count;
   1308}
   1309
   1310const struct file_operations proc_clear_refs_operations = {
   1311	.write		= clear_refs_write,
   1312	.llseek		= noop_llseek,
   1313};
   1314
   1315typedef struct {
   1316	u64 pme;
   1317} pagemap_entry_t;
   1318
   1319struct pagemapread {
   1320	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
   1321	pagemap_entry_t *buffer;
   1322	bool show_pfn;
   1323};
   1324
   1325#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
   1326#define PAGEMAP_WALK_MASK	(PMD_MASK)
   1327
   1328#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
   1329#define PM_PFRAME_BITS		55
   1330#define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
   1331#define PM_SOFT_DIRTY		BIT_ULL(55)
   1332#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
   1333#define PM_UFFD_WP		BIT_ULL(57)
   1334#define PM_FILE			BIT_ULL(61)
   1335#define PM_SWAP			BIT_ULL(62)
   1336#define PM_PRESENT		BIT_ULL(63)
   1337
   1338#define PM_END_OF_BUFFER    1
   1339
   1340static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
   1341{
   1342	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
   1343}
   1344
   1345static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
   1346			  struct pagemapread *pm)
   1347{
   1348	pm->buffer[pm->pos++] = *pme;
   1349	if (pm->pos >= pm->len)
   1350		return PM_END_OF_BUFFER;
   1351	return 0;
   1352}
   1353
   1354static int pagemap_pte_hole(unsigned long start, unsigned long end,
   1355			    __always_unused int depth, struct mm_walk *walk)
   1356{
   1357	struct pagemapread *pm = walk->private;
   1358	unsigned long addr = start;
   1359	int err = 0;
   1360
   1361	while (addr < end) {
   1362		struct vm_area_struct *vma = find_vma(walk->mm, addr);
   1363		pagemap_entry_t pme = make_pme(0, 0);
   1364		/* End of address space hole, which we mark as non-present. */
   1365		unsigned long hole_end;
   1366
   1367		if (vma)
   1368			hole_end = min(end, vma->vm_start);
   1369		else
   1370			hole_end = end;
   1371
   1372		for (; addr < hole_end; addr += PAGE_SIZE) {
   1373			err = add_to_pagemap(addr, &pme, pm);
   1374			if (err)
   1375				goto out;
   1376		}
   1377
   1378		if (!vma)
   1379			break;
   1380
   1381		/* Addresses in the VMA. */
   1382		if (vma->vm_flags & VM_SOFTDIRTY)
   1383			pme = make_pme(0, PM_SOFT_DIRTY);
   1384		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
   1385			err = add_to_pagemap(addr, &pme, pm);
   1386			if (err)
   1387				goto out;
   1388		}
   1389	}
   1390out:
   1391	return err;
   1392}
   1393
   1394static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
   1395		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
   1396{
   1397	u64 frame = 0, flags = 0;
   1398	struct page *page = NULL;
   1399	bool migration = false;
   1400
   1401	if (pte_present(pte)) {
   1402		if (pm->show_pfn)
   1403			frame = pte_pfn(pte);
   1404		flags |= PM_PRESENT;
   1405		page = vm_normal_page(vma, addr, pte);
   1406		if (pte_soft_dirty(pte))
   1407			flags |= PM_SOFT_DIRTY;
   1408		if (pte_uffd_wp(pte))
   1409			flags |= PM_UFFD_WP;
   1410	} else if (is_swap_pte(pte)) {
   1411		swp_entry_t entry;
   1412		if (pte_swp_soft_dirty(pte))
   1413			flags |= PM_SOFT_DIRTY;
   1414		if (pte_swp_uffd_wp(pte))
   1415			flags |= PM_UFFD_WP;
   1416		entry = pte_to_swp_entry(pte);
   1417		if (pm->show_pfn)
   1418			frame = swp_type(entry) |
   1419				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
   1420		flags |= PM_SWAP;
   1421		migration = is_migration_entry(entry);
   1422		if (is_pfn_swap_entry(entry))
   1423			page = pfn_swap_entry_to_page(entry);
   1424		if (pte_marker_entry_uffd_wp(entry))
   1425			flags |= PM_UFFD_WP;
   1426	}
   1427
   1428	if (page && !PageAnon(page))
   1429		flags |= PM_FILE;
   1430	if (page && !migration && page_mapcount(page) == 1)
   1431		flags |= PM_MMAP_EXCLUSIVE;
   1432	if (vma->vm_flags & VM_SOFTDIRTY)
   1433		flags |= PM_SOFT_DIRTY;
   1434
   1435	return make_pme(frame, flags);
   1436}
   1437
   1438static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
   1439			     struct mm_walk *walk)
   1440{
   1441	struct vm_area_struct *vma = walk->vma;
   1442	struct pagemapread *pm = walk->private;
   1443	spinlock_t *ptl;
   1444	pte_t *pte, *orig_pte;
   1445	int err = 0;
   1446#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1447	bool migration = false;
   1448
   1449	ptl = pmd_trans_huge_lock(pmdp, vma);
   1450	if (ptl) {
   1451		u64 flags = 0, frame = 0;
   1452		pmd_t pmd = *pmdp;
   1453		struct page *page = NULL;
   1454
   1455		if (vma->vm_flags & VM_SOFTDIRTY)
   1456			flags |= PM_SOFT_DIRTY;
   1457
   1458		if (pmd_present(pmd)) {
   1459			page = pmd_page(pmd);
   1460
   1461			flags |= PM_PRESENT;
   1462			if (pmd_soft_dirty(pmd))
   1463				flags |= PM_SOFT_DIRTY;
   1464			if (pmd_uffd_wp(pmd))
   1465				flags |= PM_UFFD_WP;
   1466			if (pm->show_pfn)
   1467				frame = pmd_pfn(pmd) +
   1468					((addr & ~PMD_MASK) >> PAGE_SHIFT);
   1469		}
   1470#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
   1471		else if (is_swap_pmd(pmd)) {
   1472			swp_entry_t entry = pmd_to_swp_entry(pmd);
   1473			unsigned long offset;
   1474
   1475			if (pm->show_pfn) {
   1476				offset = swp_offset(entry) +
   1477					((addr & ~PMD_MASK) >> PAGE_SHIFT);
   1478				frame = swp_type(entry) |
   1479					(offset << MAX_SWAPFILES_SHIFT);
   1480			}
   1481			flags |= PM_SWAP;
   1482			if (pmd_swp_soft_dirty(pmd))
   1483				flags |= PM_SOFT_DIRTY;
   1484			if (pmd_swp_uffd_wp(pmd))
   1485				flags |= PM_UFFD_WP;
   1486			VM_BUG_ON(!is_pmd_migration_entry(pmd));
   1487			migration = is_migration_entry(entry);
   1488			page = pfn_swap_entry_to_page(entry);
   1489		}
   1490#endif
   1491
   1492		if (page && !migration && page_mapcount(page) == 1)
   1493			flags |= PM_MMAP_EXCLUSIVE;
   1494
   1495		for (; addr != end; addr += PAGE_SIZE) {
   1496			pagemap_entry_t pme = make_pme(frame, flags);
   1497
   1498			err = add_to_pagemap(addr, &pme, pm);
   1499			if (err)
   1500				break;
   1501			if (pm->show_pfn) {
   1502				if (flags & PM_PRESENT)
   1503					frame++;
   1504				else if (flags & PM_SWAP)
   1505					frame += (1 << MAX_SWAPFILES_SHIFT);
   1506			}
   1507		}
   1508		spin_unlock(ptl);
   1509		return err;
   1510	}
   1511
   1512	if (pmd_trans_unstable(pmdp))
   1513		return 0;
   1514#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
   1515
   1516	/*
   1517	 * We can assume that @vma always points to a valid one and @end never
   1518	 * goes beyond vma->vm_end.
   1519	 */
   1520	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
   1521	for (; addr < end; pte++, addr += PAGE_SIZE) {
   1522		pagemap_entry_t pme;
   1523
   1524		pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
   1525		err = add_to_pagemap(addr, &pme, pm);
   1526		if (err)
   1527			break;
   1528	}
   1529	pte_unmap_unlock(orig_pte, ptl);
   1530
   1531	cond_resched();
   1532
   1533	return err;
   1534}
   1535
   1536#ifdef CONFIG_HUGETLB_PAGE
   1537/* This function walks within one hugetlb entry in the single call */
   1538static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
   1539				 unsigned long addr, unsigned long end,
   1540				 struct mm_walk *walk)
   1541{
   1542	struct pagemapread *pm = walk->private;
   1543	struct vm_area_struct *vma = walk->vma;
   1544	u64 flags = 0, frame = 0;
   1545	int err = 0;
   1546	pte_t pte;
   1547
   1548	if (vma->vm_flags & VM_SOFTDIRTY)
   1549		flags |= PM_SOFT_DIRTY;
   1550
   1551	pte = huge_ptep_get(ptep);
   1552	if (pte_present(pte)) {
   1553		struct page *page = pte_page(pte);
   1554
   1555		if (!PageAnon(page))
   1556			flags |= PM_FILE;
   1557
   1558		if (page_mapcount(page) == 1)
   1559			flags |= PM_MMAP_EXCLUSIVE;
   1560
   1561		if (huge_pte_uffd_wp(pte))
   1562			flags |= PM_UFFD_WP;
   1563
   1564		flags |= PM_PRESENT;
   1565		if (pm->show_pfn)
   1566			frame = pte_pfn(pte) +
   1567				((addr & ~hmask) >> PAGE_SHIFT);
   1568	} else if (pte_swp_uffd_wp_any(pte)) {
   1569		flags |= PM_UFFD_WP;
   1570	}
   1571
   1572	for (; addr != end; addr += PAGE_SIZE) {
   1573		pagemap_entry_t pme = make_pme(frame, flags);
   1574
   1575		err = add_to_pagemap(addr, &pme, pm);
   1576		if (err)
   1577			return err;
   1578		if (pm->show_pfn && (flags & PM_PRESENT))
   1579			frame++;
   1580	}
   1581
   1582	cond_resched();
   1583
   1584	return err;
   1585}
   1586#else
   1587#define pagemap_hugetlb_range	NULL
   1588#endif /* HUGETLB_PAGE */
   1589
   1590static const struct mm_walk_ops pagemap_ops = {
   1591	.pmd_entry	= pagemap_pmd_range,
   1592	.pte_hole	= pagemap_pte_hole,
   1593	.hugetlb_entry	= pagemap_hugetlb_range,
   1594};
   1595
   1596/*
   1597 * /proc/pid/pagemap - an array mapping virtual pages to pfns
   1598 *
   1599 * For each page in the address space, this file contains one 64-bit entry
   1600 * consisting of the following:
   1601 *
   1602 * Bits 0-54  page frame number (PFN) if present
   1603 * Bits 0-4   swap type if swapped
   1604 * Bits 5-54  swap offset if swapped
   1605 * Bit  55    pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
   1606 * Bit  56    page exclusively mapped
   1607 * Bit  57    pte is uffd-wp write-protected
   1608 * Bits 58-60 zero
   1609 * Bit  61    page is file-page or shared-anon
   1610 * Bit  62    page swapped
   1611 * Bit  63    page present
   1612 *
   1613 * If the page is not present but in swap, then the PFN contains an
   1614 * encoding of the swap file number and the page's offset into the
   1615 * swap. Unmapped pages return a null PFN. This allows determining
   1616 * precisely which pages are mapped (or in swap) and comparing mapped
   1617 * pages between processes.
   1618 *
   1619 * Efficient users of this interface will use /proc/pid/maps to
   1620 * determine which areas of memory are actually mapped and llseek to
   1621 * skip over unmapped regions.
   1622 */
   1623static ssize_t pagemap_read(struct file *file, char __user *buf,
   1624			    size_t count, loff_t *ppos)
   1625{
   1626	struct mm_struct *mm = file->private_data;
   1627	struct pagemapread pm;
   1628	unsigned long src;
   1629	unsigned long svpfn;
   1630	unsigned long start_vaddr;
   1631	unsigned long end_vaddr;
   1632	int ret = 0, copied = 0;
   1633
   1634	if (!mm || !mmget_not_zero(mm))
   1635		goto out;
   1636
   1637	ret = -EINVAL;
   1638	/* file position must be aligned */
   1639	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
   1640		goto out_mm;
   1641
   1642	ret = 0;
   1643	if (!count)
   1644		goto out_mm;
   1645
   1646	/* do not disclose physical addresses: attack vector */
   1647	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
   1648
   1649	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
   1650	pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
   1651	ret = -ENOMEM;
   1652	if (!pm.buffer)
   1653		goto out_mm;
   1654
   1655	src = *ppos;
   1656	svpfn = src / PM_ENTRY_BYTES;
   1657	end_vaddr = mm->task_size;
   1658
   1659	/* watch out for wraparound */
   1660	start_vaddr = end_vaddr;
   1661	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
   1662		start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
   1663
   1664	/* Ensure the address is inside the task */
   1665	if (start_vaddr > mm->task_size)
   1666		start_vaddr = end_vaddr;
   1667
   1668	/*
   1669	 * The odds are that this will stop walking way
   1670	 * before end_vaddr, because the length of the
   1671	 * user buffer is tracked in "pm", and the walk
   1672	 * will stop when we hit the end of the buffer.
   1673	 */
   1674	ret = 0;
   1675	while (count && (start_vaddr < end_vaddr)) {
   1676		int len;
   1677		unsigned long end;
   1678
   1679		pm.pos = 0;
   1680		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
   1681		/* overflow ? */
   1682		if (end < start_vaddr || end > end_vaddr)
   1683			end = end_vaddr;
   1684		ret = mmap_read_lock_killable(mm);
   1685		if (ret)
   1686			goto out_free;
   1687		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
   1688		mmap_read_unlock(mm);
   1689		start_vaddr = end;
   1690
   1691		len = min(count, PM_ENTRY_BYTES * pm.pos);
   1692		if (copy_to_user(buf, pm.buffer, len)) {
   1693			ret = -EFAULT;
   1694			goto out_free;
   1695		}
   1696		copied += len;
   1697		buf += len;
   1698		count -= len;
   1699	}
   1700	*ppos += copied;
   1701	if (!ret || ret == PM_END_OF_BUFFER)
   1702		ret = copied;
   1703
   1704out_free:
   1705	kfree(pm.buffer);
   1706out_mm:
   1707	mmput(mm);
   1708out:
   1709	return ret;
   1710}
   1711
   1712static int pagemap_open(struct inode *inode, struct file *file)
   1713{
   1714	struct mm_struct *mm;
   1715
   1716	mm = proc_mem_open(inode, PTRACE_MODE_READ);
   1717	if (IS_ERR(mm))
   1718		return PTR_ERR(mm);
   1719	file->private_data = mm;
   1720	return 0;
   1721}
   1722
   1723static int pagemap_release(struct inode *inode, struct file *file)
   1724{
   1725	struct mm_struct *mm = file->private_data;
   1726
   1727	if (mm)
   1728		mmdrop(mm);
   1729	return 0;
   1730}
   1731
   1732const struct file_operations proc_pagemap_operations = {
   1733	.llseek		= mem_lseek, /* borrow this */
   1734	.read		= pagemap_read,
   1735	.open		= pagemap_open,
   1736	.release	= pagemap_release,
   1737};
   1738#endif /* CONFIG_PROC_PAGE_MONITOR */
   1739
   1740#ifdef CONFIG_NUMA
   1741
   1742struct numa_maps {
   1743	unsigned long pages;
   1744	unsigned long anon;
   1745	unsigned long active;
   1746	unsigned long writeback;
   1747	unsigned long mapcount_max;
   1748	unsigned long dirty;
   1749	unsigned long swapcache;
   1750	unsigned long node[MAX_NUMNODES];
   1751};
   1752
   1753struct numa_maps_private {
   1754	struct proc_maps_private proc_maps;
   1755	struct numa_maps md;
   1756};
   1757
   1758static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
   1759			unsigned long nr_pages)
   1760{
   1761	int count = page_mapcount(page);
   1762
   1763	md->pages += nr_pages;
   1764	if (pte_dirty || PageDirty(page))
   1765		md->dirty += nr_pages;
   1766
   1767	if (PageSwapCache(page))
   1768		md->swapcache += nr_pages;
   1769
   1770	if (PageActive(page) || PageUnevictable(page))
   1771		md->active += nr_pages;
   1772
   1773	if (PageWriteback(page))
   1774		md->writeback += nr_pages;
   1775
   1776	if (PageAnon(page))
   1777		md->anon += nr_pages;
   1778
   1779	if (count > md->mapcount_max)
   1780		md->mapcount_max = count;
   1781
   1782	md->node[page_to_nid(page)] += nr_pages;
   1783}
   1784
   1785static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
   1786		unsigned long addr)
   1787{
   1788	struct page *page;
   1789	int nid;
   1790
   1791	if (!pte_present(pte))
   1792		return NULL;
   1793
   1794	page = vm_normal_page(vma, addr, pte);
   1795	if (!page)
   1796		return NULL;
   1797
   1798	if (PageReserved(page))
   1799		return NULL;
   1800
   1801	nid = page_to_nid(page);
   1802	if (!node_isset(nid, node_states[N_MEMORY]))
   1803		return NULL;
   1804
   1805	return page;
   1806}
   1807
   1808#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1809static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
   1810					      struct vm_area_struct *vma,
   1811					      unsigned long addr)
   1812{
   1813	struct page *page;
   1814	int nid;
   1815
   1816	if (!pmd_present(pmd))
   1817		return NULL;
   1818
   1819	page = vm_normal_page_pmd(vma, addr, pmd);
   1820	if (!page)
   1821		return NULL;
   1822
   1823	if (PageReserved(page))
   1824		return NULL;
   1825
   1826	nid = page_to_nid(page);
   1827	if (!node_isset(nid, node_states[N_MEMORY]))
   1828		return NULL;
   1829
   1830	return page;
   1831}
   1832#endif
   1833
   1834static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
   1835		unsigned long end, struct mm_walk *walk)
   1836{
   1837	struct numa_maps *md = walk->private;
   1838	struct vm_area_struct *vma = walk->vma;
   1839	spinlock_t *ptl;
   1840	pte_t *orig_pte;
   1841	pte_t *pte;
   1842
   1843#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1844	ptl = pmd_trans_huge_lock(pmd, vma);
   1845	if (ptl) {
   1846		struct page *page;
   1847
   1848		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
   1849		if (page)
   1850			gather_stats(page, md, pmd_dirty(*pmd),
   1851				     HPAGE_PMD_SIZE/PAGE_SIZE);
   1852		spin_unlock(ptl);
   1853		return 0;
   1854	}
   1855
   1856	if (pmd_trans_unstable(pmd))
   1857		return 0;
   1858#endif
   1859	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
   1860	do {
   1861		struct page *page = can_gather_numa_stats(*pte, vma, addr);
   1862		if (!page)
   1863			continue;
   1864		gather_stats(page, md, pte_dirty(*pte), 1);
   1865
   1866	} while (pte++, addr += PAGE_SIZE, addr != end);
   1867	pte_unmap_unlock(orig_pte, ptl);
   1868	cond_resched();
   1869	return 0;
   1870}
   1871#ifdef CONFIG_HUGETLB_PAGE
   1872static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
   1873		unsigned long addr, unsigned long end, struct mm_walk *walk)
   1874{
   1875	pte_t huge_pte = huge_ptep_get(pte);
   1876	struct numa_maps *md;
   1877	struct page *page;
   1878
   1879	if (!pte_present(huge_pte))
   1880		return 0;
   1881
   1882	page = pte_page(huge_pte);
   1883
   1884	md = walk->private;
   1885	gather_stats(page, md, pte_dirty(huge_pte), 1);
   1886	return 0;
   1887}
   1888
   1889#else
   1890static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
   1891		unsigned long addr, unsigned long end, struct mm_walk *walk)
   1892{
   1893	return 0;
   1894}
   1895#endif
   1896
   1897static const struct mm_walk_ops show_numa_ops = {
   1898	.hugetlb_entry = gather_hugetlb_stats,
   1899	.pmd_entry = gather_pte_stats,
   1900};
   1901
   1902/*
   1903 * Display pages allocated per node and memory policy via /proc.
   1904 */
   1905static int show_numa_map(struct seq_file *m, void *v)
   1906{
   1907	struct numa_maps_private *numa_priv = m->private;
   1908	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
   1909	struct vm_area_struct *vma = v;
   1910	struct numa_maps *md = &numa_priv->md;
   1911	struct file *file = vma->vm_file;
   1912	struct mm_struct *mm = vma->vm_mm;
   1913	struct mempolicy *pol;
   1914	char buffer[64];
   1915	int nid;
   1916
   1917	if (!mm)
   1918		return 0;
   1919
   1920	/* Ensure we start with an empty set of numa_maps statistics. */
   1921	memset(md, 0, sizeof(*md));
   1922
   1923	pol = __get_vma_policy(vma, vma->vm_start);
   1924	if (pol) {
   1925		mpol_to_str(buffer, sizeof(buffer), pol);
   1926		mpol_cond_put(pol);
   1927	} else {
   1928		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
   1929	}
   1930
   1931	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
   1932
   1933	if (file) {
   1934		seq_puts(m, " file=");
   1935		seq_file_path(m, file, "\n\t= ");
   1936	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
   1937		seq_puts(m, " heap");
   1938	} else if (is_stack(vma)) {
   1939		seq_puts(m, " stack");
   1940	}
   1941
   1942	if (is_vm_hugetlb_page(vma))
   1943		seq_puts(m, " huge");
   1944
   1945	/* mmap_lock is held by m_start */
   1946	walk_page_vma(vma, &show_numa_ops, md);
   1947
   1948	if (!md->pages)
   1949		goto out;
   1950
   1951	if (md->anon)
   1952		seq_printf(m, " anon=%lu", md->anon);
   1953
   1954	if (md->dirty)
   1955		seq_printf(m, " dirty=%lu", md->dirty);
   1956
   1957	if (md->pages != md->anon && md->pages != md->dirty)
   1958		seq_printf(m, " mapped=%lu", md->pages);
   1959
   1960	if (md->mapcount_max > 1)
   1961		seq_printf(m, " mapmax=%lu", md->mapcount_max);
   1962
   1963	if (md->swapcache)
   1964		seq_printf(m, " swapcache=%lu", md->swapcache);
   1965
   1966	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
   1967		seq_printf(m, " active=%lu", md->active);
   1968
   1969	if (md->writeback)
   1970		seq_printf(m, " writeback=%lu", md->writeback);
   1971
   1972	for_each_node_state(nid, N_MEMORY)
   1973		if (md->node[nid])
   1974			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
   1975
   1976	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
   1977out:
   1978	seq_putc(m, '\n');
   1979	return 0;
   1980}
   1981
   1982static const struct seq_operations proc_pid_numa_maps_op = {
   1983	.start  = m_start,
   1984	.next   = m_next,
   1985	.stop   = m_stop,
   1986	.show   = show_numa_map,
   1987};
   1988
   1989static int pid_numa_maps_open(struct inode *inode, struct file *file)
   1990{
   1991	return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
   1992				sizeof(struct numa_maps_private));
   1993}
   1994
   1995const struct file_operations proc_pid_numa_maps_operations = {
   1996	.open		= pid_numa_maps_open,
   1997	.read		= seq_read,
   1998	.llseek		= seq_lseek,
   1999	.release	= proc_map_release,
   2000};
   2001
   2002#endif /* CONFIG_NUMA */