cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

page-types.c (31133B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * page-types: Tool for querying page flags
      4 *
      5 * Copyright (C) 2009 Intel corporation
      6 *
      7 * Authors: Wu Fengguang <fengguang.wu@intel.com>
      8 */
      9
     10#define _FILE_OFFSET_BITS 64
     11#define _GNU_SOURCE
     12#include <stdio.h>
     13#include <stdlib.h>
     14#include <unistd.h>
     15#include <stdint.h>
     16#include <stdarg.h>
     17#include <string.h>
     18#include <getopt.h>
     19#include <limits.h>
     20#include <assert.h>
     21#include <ftw.h>
     22#include <time.h>
     23#include <setjmp.h>
     24#include <signal.h>
     25#include <sys/types.h>
     26#include <sys/errno.h>
     27#include <sys/fcntl.h>
     28#include <sys/mount.h>
     29#include <sys/statfs.h>
     30#include <sys/mman.h>
     31#include "../../include/uapi/linux/magic.h"
     32#include "../../include/uapi/linux/kernel-page-flags.h"
     33#include <api/fs/fs.h>
     34
/* Fallback size for path buffers when no included header defines MAX_PATH. */
#ifndef MAX_PATH
# define MAX_PATH 256
#endif

/*
 * STR(x) stringifies x after macro expansion (through the extra _STR level);
 * _STR(x) stringifies its argument exactly as written.
 */
#ifndef STR
# define _STR(x) #x
# define STR(x) _STR(x)
#endif
     43
/*
 * pagemap kernel ABI bits
 *
 * How this tool decodes one 64-bit pagemap entry:
 *   - the low PM_PFRAME_BITS bits are extracted by PM_PFRAME();
 *   - PM_SWAP_OFFSET() takes the same field and drops its low
 *     MAX_SWAPFILES_SHIFT bits;
 *   - the remaining macros are single-bit status masks.
 */

#define PM_ENTRY_BYTES		8
#define PM_PFRAME_BITS		55
#define PM_PFRAME_MASK		((1LL << PM_PFRAME_BITS) - 1)
#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)
#define MAX_SWAPFILES_SHIFT	5
#define PM_SWAP_OFFSET(x)	(((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT)
#define PM_SOFT_DIRTY		(1ULL << 55)
#define PM_MMAP_EXCLUSIVE	(1ULL << 56)
#define PM_FILE			(1ULL << 61)
#define PM_SWAP			(1ULL << 62)
#define PM_PRESENT		(1ULL << 63)
     59
/*
 * kernel page flags
 *
 * Paths of the per-page-frame files used by this tool and the size in bytes
 * of one kpageflags record.
 */

#define KPF_BYTES		8
#define PROC_KPAGEFLAGS		"/proc/kpageflags"
#define PROC_KPAGECOUNT		"/proc/kpagecount"
#define PROC_KPAGECGROUP	"/proc/kpagecgroup"

/* Idle-page bitmap; mark_page_idle() writes 64-bit words to page_idle_fd. */
#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"
     70
/*
 * [32-] kernel hacking assistances
 *
 * Bit numbers (not masks); turn one into a mask with BIT(name).
 */
#define KPF_RESERVED		32
#define KPF_MLOCKED		33
#define KPF_MAPPEDTODISK	34
#define KPF_PRIVATE		35
#define KPF_PRIVATE_2		36
#define KPF_OWNER_PRIVATE	37
#define KPF_ARCH		38
#define KPF_UNCACHED		39
#define KPF_SOFTDIRTY		40
#define KPF_ARCH_2		41

/* [47-] take some arbitrary free slots for expanding overloaded flags
 * not part of kernel API
 *
 * These bits are only ever set by expand_overloaded_flags().  KPF_FILE,
 * KPF_SWAP and KPF_MMAP_EXCLUSIVE are filled in from the pagemap entry
 * (PM_FILE, PM_SWAP, PM_MMAP_EXCLUSIVE) rather than from kpageflags.
 */
#define KPF_ANON_EXCLUSIVE	47
#define KPF_READAHEAD		48
#define KPF_SLOB_FREE		49
#define KPF_SLUB_FROZEN		50
#define KPF_SLUB_DEBUG		51
#define KPF_FILE		61
#define KPF_SWAP		62
#define KPF_MMAP_EXCLUSIVE	63

/*
 * Masks over groups of bits: KPF_HACKERS_BITS covers bits 32..47 (so it
 * includes KPF_ANON_EXCLUSIVE), KPF_OVERLOADED_BITS covers bits 48..63.
 * BIT(name) pastes the KPF_ prefix onto name and yields a one-bit mask.
 */
#define KPF_ALL_BITS		((uint64_t)~0ULL)
#define KPF_HACKERS_BITS	(0xffffULL << 32)
#define KPF_OVERLOADED_BITS	(0xffffULL << 48)
#define BIT(name)		(1ULL << KPF_##name)
#define BITS_COMPOUND		(BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
/*
 * Flag names indexed by bit number.  Each entry has the form
 * "<letter>:<long name>": page_flag_name() prints the single letter at
 * index 0, while page_flag_longname() and usage() print the text from
 * index 2 on.  Bits without an entry are NULL.  Letters are not unique
 * across the table (for example 'H', 'd', 'I', 'P', 'A' and 'E' each
 * appear twice).
 *
 * NOTE(review): the KPF_* indices below 32 are not defined in this file;
 * presumably they come from the included kernel-page-flags.h - confirm.
 */
static const char * const page_flag_names[] = {
	[KPF_LOCKED]		= "L:locked",
	[KPF_ERROR]		= "E:error",
	[KPF_REFERENCED]	= "R:referenced",
	[KPF_UPTODATE]		= "U:uptodate",
	[KPF_DIRTY]		= "D:dirty",
	[KPF_LRU]		= "l:lru",
	[KPF_ACTIVE]		= "A:active",
	[KPF_SLAB]		= "S:slab",
	[KPF_WRITEBACK]		= "W:writeback",
	[KPF_RECLAIM]		= "I:reclaim",
	[KPF_BUDDY]		= "B:buddy",

	[KPF_MMAP]		= "M:mmap",
	[KPF_ANON]		= "a:anonymous",
	[KPF_SWAPCACHE]		= "s:swapcache",
	[KPF_SWAPBACKED]	= "b:swapbacked",
	[KPF_COMPOUND_HEAD]	= "H:compound_head",
	[KPF_COMPOUND_TAIL]	= "T:compound_tail",
	[KPF_HUGE]		= "G:huge",
	[KPF_UNEVICTABLE]	= "u:unevictable",
	[KPF_HWPOISON]		= "X:hwpoison",
	[KPF_NOPAGE]		= "n:nopage",
	[KPF_KSM]		= "x:ksm",
	[KPF_THP]		= "t:thp",
	[KPF_OFFLINE]		= "o:offline",
	[KPF_PGTABLE]		= "g:pgtable",
	[KPF_ZERO_PAGE]		= "z:zero_page",
	[KPF_IDLE]              = "i:idle_page",

	/* bits 32..41: hidden by well_known_flags() unless raw mode is on */
	[KPF_RESERVED]		= "r:reserved",
	[KPF_MLOCKED]		= "m:mlocked",
	[KPF_MAPPEDTODISK]	= "d:mappedtodisk",
	[KPF_PRIVATE]		= "P:private",
	[KPF_PRIVATE_2]		= "p:private_2",
	[KPF_OWNER_PRIVATE]	= "O:owner_private",
	[KPF_ARCH]		= "h:arch",
	[KPF_UNCACHED]		= "c:uncached",
	[KPF_SOFTDIRTY]		= "f:softdirty",
	[KPF_ARCH_2]		= "H:arch_2",

	/* synthetic bits produced by expand_overloaded_flags() */
	[KPF_ANON_EXCLUSIVE]	= "d:anon_exclusive",
	[KPF_READAHEAD]		= "I:readahead",
	[KPF_SLOB_FREE]		= "P:slob_free",
	[KPF_SLUB_FROZEN]	= "A:slub_frozen",
	[KPF_SLUB_DEBUG]	= "E:slub_debug",

	/* bits copied from the pagemap entry */
	[KPF_FILE]		= "F:file",
	[KPF_SWAP]		= "w:swap",
	[KPF_MMAP_EXCLUSIVE]	= "1:mmap_exclusive",
};
    152
    153
/*
 * data structures
 *
 * All state is file-scope: command-line options, the address ranges and
 * bit filters to apply, open file descriptors, and the summary table.
 */

static int		opt_raw;	/* for kernel developers */
static int		opt_list;	/* list pages: 1 = in ranges, 2 = one by one */
static int		opt_mark_idle;	/* mark matching pages idle (see mark_page_idle) */
static int		opt_no_summary;	/* don't show summary */
static pid_t		opt_pid;	/* process to walk */
const char		*opt_file;	/* file or directory path */
static uint64_t		opt_cgroup;	/* cgroup inode */
static int		opt_list_cgroup;/* list page cgroup */
static int		opt_list_mapcnt;/* list page map count */
static const char	*opt_kpageflags;/* kpageflags file to parse */

/* Page ranges to walk; filled by add_addr_range(). Units are pages. */
#define MAX_ADDR_RANGES	1024
static int		nr_addr_ranges;
static unsigned long	opt_offset[MAX_ADDR_RANGES];
static unsigned long	opt_size[MAX_ADDR_RANGES];

/* VMAs of the target process as [pg_start, pg_end) page numbers; filled by parse_pid(). */
#define MAX_VMAS	10240
static int		nr_vmas;
static unsigned long	pg_start[MAX_VMAS];
static unsigned long	pg_end[MAX_VMAS];

/* Flag filters; a page must satisfy every (mask, bits) pair, see bit_mask_ok(). */
#define MAX_BIT_FILTERS	64
static int		nr_bit_filters;
static uint64_t		opt_mask[MAX_BIT_FILTERS];
static uint64_t		opt_bits[MAX_BIT_FILTERS];

static int		page_size;

/* The three fds initialised to -1 are optional; their readers skip I/O while they are negative. */
static int		pagemap_fd;
static int		kpageflags_fd;
static int		kpagecount_fd = -1;
static int		kpagecgroup_fd = -1;
static int		page_idle_fd = -1;

static int		opt_hwpoison;
static int		opt_unpoison;

/* Opened lazily by prepare_hwpoison_fd(); a value of 0 is treated there as "not opened". */
static const char	*hwpoison_debug_fs;
static int		hwpoison_inject_fd;
static int		hwpoison_forget_fd;

/*
 * Summary table: page_flags[] holds the distinct flag words seen so far and
 * nr_pages[] the number of pages with each; slots are assigned by hash_slot().
 */
#define HASH_SHIFT	13
#define HASH_SIZE	(1 << HASH_SHIFT)
#define HASH_MASK	(HASH_SIZE - 1)
#define HASH_KEY(flags)	(flags & HASH_MASK)

static unsigned long	total_pages;
static unsigned long	nr_pages[HASH_SIZE];
static uint64_t		page_flags[HASH_SIZE];
    207
    208
    209/*
    210 * helper functions
    211 */
    212
/* Element count of a true array (not valid on a pointer). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/*
 * min_t()/max_t(): compare x and y after converting both to 'type'.
 * Each argument is evaluated exactly once; relies on the GNU statement
 * expression extension.
 */
#define min_t(type, x, y) ({			\
	type __min1 = (x);			\
	type __min2 = (y);			\
	__min1 < __min2 ? __min1 : __min2; })

#define max_t(type, x, y) ({			\
	type __max1 = (x);			\
	type __max2 = (y);			\
	__max1 > __max2 ? __max1 : __max2; })
    224
    225static unsigned long pages2mb(unsigned long pages)
    226{
    227	return (pages * page_size) >> 20;
    228}
    229
    230static void fatal(const char *x, ...)
    231{
    232	va_list ap;
    233
    234	va_start(ap, x);
    235	vfprintf(stderr, x, ap);
    236	va_end(ap);
    237	exit(EXIT_FAILURE);
    238}
    239
    240static int checked_open(const char *pathname, int flags)
    241{
    242	int fd = open(pathname, flags);
    243
    244	if (fd < 0) {
    245		perror(pathname);
    246		exit(EXIT_FAILURE);
    247	}
    248
    249	return fd;
    250}
    251
    252/*
    253 * pagemap/kpageflags routines
    254 */
    255
    256static unsigned long do_u64_read(int fd, const char *name,
    257				 uint64_t *buf,
    258				 unsigned long index,
    259				 unsigned long count)
    260{
    261	long bytes;
    262
    263	if (index > ULONG_MAX / 8)
    264		fatal("index overflow: %lu\n", index);
    265
    266	bytes = pread(fd, buf, count * 8, (off_t)index * 8);
    267	if (bytes < 0) {
    268		perror(name);
    269		exit(EXIT_FAILURE);
    270	}
    271	if (bytes % 8)
    272		fatal("partial read: %lu bytes\n", bytes);
    273
    274	return bytes / 8;
    275}
    276
    277static unsigned long kpageflags_read(uint64_t *buf,
    278				     unsigned long index,
    279				     unsigned long pages)
    280{
    281	return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages);
    282}
    283
    284static unsigned long kpagecgroup_read(uint64_t *buf,
    285				      unsigned long index,
    286				      unsigned long pages)
    287{
    288	if (kpagecgroup_fd < 0)
    289		return pages;
    290
    291	return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
    292}
    293
    294static unsigned long kpagecount_read(uint64_t *buf,
    295				     unsigned long index,
    296				     unsigned long pages)
    297{
    298	return kpagecount_fd < 0 ? pages :
    299		do_u64_read(kpagecount_fd, PROC_KPAGECOUNT,
    300			    buf, index, pages);
    301}
    302
    303static unsigned long pagemap_read(uint64_t *buf,
    304				  unsigned long index,
    305				  unsigned long pages)
    306{
    307	return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
    308}
    309
    310static unsigned long pagemap_pfn(uint64_t val)
    311{
    312	unsigned long pfn;
    313
    314	if (val & PM_PRESENT)
    315		pfn = PM_PFRAME(val);
    316	else
    317		pfn = 0;
    318
    319	return pfn;
    320}
    321
    322static unsigned long pagemap_swap_offset(uint64_t val)
    323{
    324	return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0;
    325}
    326
    327/*
    328 * page flag names
    329 */
    330
    331static char *page_flag_name(uint64_t flags)
    332{
    333	static char buf[65];
    334	int present;
    335	size_t i, j;
    336
    337	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
    338		present = (flags >> i) & 1;
    339		if (!page_flag_names[i]) {
    340			if (present)
    341				fatal("unknown flag bit %d\n", i);
    342			continue;
    343		}
    344		buf[j++] = present ? page_flag_names[i][0] : '_';
    345	}
    346
    347	return buf;
    348}
    349
    350static char *page_flag_longname(uint64_t flags)
    351{
    352	static char buf[1024];
    353	size_t i, n;
    354
    355	for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
    356		if (!page_flag_names[i])
    357			continue;
    358		if ((flags >> i) & 1)
    359			n += snprintf(buf + n, sizeof(buf) - n, "%s,",
    360					page_flag_names[i] + 2);
    361	}
    362	if (n)
    363		n--;
    364	buf[n] = '\0';
    365
    366	return buf;
    367}
    368
    369
    370/*
    371 * page list and summary
    372 */
    373
    374static void show_page_range(unsigned long voffset, unsigned long offset,
    375			    unsigned long size, uint64_t flags,
    376			    uint64_t cgroup, uint64_t mapcnt)
    377{
    378	static uint64_t      flags0;
    379	static uint64_t	     cgroup0;
    380	static uint64_t      mapcnt0;
    381	static unsigned long voff;
    382	static unsigned long index;
    383	static unsigned long count;
    384
    385	if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 &&
    386	    offset == index + count && size && voffset == voff + count) {
    387		count += size;
    388		return;
    389	}
    390
    391	if (count) {
    392		if (opt_pid)
    393			printf("%lx\t", voff);
    394		if (opt_file)
    395			printf("%lx\t", voff);
    396		if (opt_list_cgroup)
    397			printf("@%llu\t", (unsigned long long)cgroup0);
    398		if (opt_list_mapcnt)
    399			printf("%lu\t", mapcnt0);
    400		printf("%lx\t%lx\t%s\n",
    401				index, count, page_flag_name(flags0));
    402	}
    403
    404	flags0 = flags;
    405	cgroup0 = cgroup;
    406	mapcnt0 = mapcnt;
    407	index  = offset;
    408	voff   = voffset;
    409	count  = size;
    410}
    411
    412static void flush_page_range(void)
    413{
    414	show_page_range(0, 0, 0, 0, 0, 0);
    415}
    416
    417static void show_page(unsigned long voffset, unsigned long offset,
    418		      uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
    419{
    420	if (opt_pid)
    421		printf("%lx\t", voffset);
    422	if (opt_file)
    423		printf("%lx\t", voffset);
    424	if (opt_list_cgroup)
    425		printf("@%llu\t", (unsigned long long)cgroup);
    426	if (opt_list_mapcnt)
    427		printf("%lu\t", mapcnt);
    428
    429	printf("%lx\t%s\n", offset, page_flag_name(flags));
    430}
    431
    432static void show_summary(void)
    433{
    434	size_t i;
    435
    436	printf("             flags\tpage-count       MB"
    437		"  symbolic-flags\t\t\tlong-symbolic-flags\n");
    438
    439	for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
    440		if (nr_pages[i])
    441			printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
    442				(unsigned long long)page_flags[i],
    443				nr_pages[i],
    444				pages2mb(nr_pages[i]),
    445				page_flag_name(page_flags[i]),
    446				page_flag_longname(page_flags[i]));
    447	}
    448
    449	printf("             total\t%10lu %8lu\n",
    450			total_pages, pages2mb(total_pages));
    451}
    452
    453
    454/*
    455 * page flag filters
    456 */
    457
    458static int bit_mask_ok(uint64_t flags)
    459{
    460	int i;
    461
    462	for (i = 0; i < nr_bit_filters; i++) {
    463		if (opt_bits[i] == KPF_ALL_BITS) {
    464			if ((flags & opt_mask[i]) == 0)
    465				return 0;
    466		} else {
    467			if ((flags & opt_mask[i]) != opt_bits[i])
    468				return 0;
    469		}
    470	}
    471
    472	return 1;
    473}
    474
    475static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
    476{
    477	/* Anonymous pages overload PG_mappedtodisk */
    478	if ((flags & BIT(ANON)) && (flags & BIT(MAPPEDTODISK)))
    479		flags ^= BIT(MAPPEDTODISK) | BIT(ANON_EXCLUSIVE);
    480
    481	/* SLOB/SLUB overload several page flags */
    482	if (flags & BIT(SLAB)) {
    483		if (flags & BIT(PRIVATE))
    484			flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
    485		if (flags & BIT(ACTIVE))
    486			flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
    487		if (flags & BIT(ERROR))
    488			flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
    489	}
    490
    491	/* PG_reclaim is overloaded as PG_readahead in the read path */
    492	if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
    493		flags ^= BIT(RECLAIM) | BIT(READAHEAD);
    494
    495	if (pme & PM_SOFT_DIRTY)
    496		flags |= BIT(SOFTDIRTY);
    497	if (pme & PM_FILE)
    498		flags |= BIT(FILE);
    499	if (pme & PM_SWAP)
    500		flags |= BIT(SWAP);
    501	if (pme & PM_MMAP_EXCLUSIVE)
    502		flags |= BIT(MMAP_EXCLUSIVE);
    503
    504	return flags;
    505}
    506
    507static uint64_t well_known_flags(uint64_t flags)
    508{
    509	/* hide flags intended only for kernel hacker */
    510	flags &= ~KPF_HACKERS_BITS;
    511
    512	/* hide non-hugeTLB compound pages */
    513	if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
    514		flags &= ~BITS_COMPOUND;
    515
    516	return flags;
    517}
    518
    519static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme)
    520{
    521	if (opt_raw)
    522		flags = expand_overloaded_flags(flags, pme);
    523	else
    524		flags = well_known_flags(flags);
    525
    526	return flags;
    527}
    528
    529/*
    530 * page actions
    531 */
    532
    533static void prepare_hwpoison_fd(void)
    534{
    535	char buf[MAX_PATH + 1];
    536
    537	hwpoison_debug_fs = debugfs__mount();
    538	if (!hwpoison_debug_fs) {
    539		perror("mount debugfs");
    540		exit(EXIT_FAILURE);
    541	}
    542
    543	if (opt_hwpoison && !hwpoison_inject_fd) {
    544		snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
    545			hwpoison_debug_fs);
    546		hwpoison_inject_fd = checked_open(buf, O_WRONLY);
    547	}
    548
    549	if (opt_unpoison && !hwpoison_forget_fd) {
    550		snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
    551			hwpoison_debug_fs);
    552		hwpoison_forget_fd = checked_open(buf, O_WRONLY);
    553	}
    554}
    555
    556static int hwpoison_page(unsigned long offset)
    557{
    558	char buf[100];
    559	int len;
    560
    561	len = sprintf(buf, "0x%lx\n", offset);
    562	len = write(hwpoison_inject_fd, buf, len);
    563	if (len < 0) {
    564		perror("hwpoison inject");
    565		return len;
    566	}
    567	return 0;
    568}
    569
    570static int unpoison_page(unsigned long offset)
    571{
    572	char buf[100];
    573	int len;
    574
    575	len = sprintf(buf, "0x%lx\n", offset);
    576	len = write(hwpoison_forget_fd, buf, len);
    577	if (len < 0) {
    578		perror("hwpoison forget");
    579		return len;
    580	}
    581	return 0;
    582}
    583
    584static int mark_page_idle(unsigned long offset)
    585{
    586	static unsigned long off;
    587	static uint64_t buf;
    588	int len;
    589
    590	if ((offset / 64 == off / 64) || buf == 0) {
    591		buf |= 1UL << (offset % 64);
    592		off = offset;
    593		return 0;
    594	}
    595
    596	len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
    597	if (len < 0) {
    598		perror("mark page idle");
    599		return len;
    600	}
    601
    602	buf = 1UL << (offset % 64);
    603	off = offset;
    604
    605	return 0;
    606}
    607
    608/*
    609 * page frame walker
    610 */
    611
    612static size_t hash_slot(uint64_t flags)
    613{
    614	size_t k = HASH_KEY(flags);
    615	size_t i;
    616
    617	/* Explicitly reserve slot 0 for flags 0: the following logic
    618	 * cannot distinguish an unoccupied slot from slot (flags==0).
    619	 */
    620	if (flags == 0)
    621		return 0;
    622
    623	/* search through the remaining (HASH_SIZE-1) slots */
    624	for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
    625		if (!k || k >= ARRAY_SIZE(page_flags))
    626			k = 1;
    627		if (page_flags[k] == 0) {
    628			page_flags[k] = flags;
    629			return k;
    630		}
    631		if (page_flags[k] == flags)
    632			return k;
    633	}
    634
    635	fatal("hash table full: bump up HASH_SHIFT?\n");
    636	exit(EXIT_FAILURE);
    637}
    638
    639static void add_page(unsigned long voffset, unsigned long offset,
    640		     uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
    641		     uint64_t pme)
    642{
    643	flags = kpageflags_flags(flags, pme);
    644
    645	if (!bit_mask_ok(flags))
    646		return;
    647
    648	if (opt_cgroup && cgroup != (uint64_t)opt_cgroup)
    649		return;
    650
    651	if (opt_hwpoison)
    652		hwpoison_page(offset);
    653	if (opt_unpoison)
    654		unpoison_page(offset);
    655
    656	if (opt_mark_idle)
    657		mark_page_idle(offset);
    658
    659	if (opt_list == 1)
    660		show_page_range(voffset, offset, 1, flags, cgroup, mapcnt);
    661	else if (opt_list == 2)
    662		show_page(voffset, offset, flags, cgroup, mapcnt);
    663
    664	nr_pages[hash_slot(flags)]++;
    665	total_pages++;
    666}
    667
    668#define KPAGEFLAGS_BATCH	(64 << 10)	/* 64k pages */
    669static void walk_pfn(unsigned long voffset,
    670		     unsigned long index,
    671		     unsigned long count,
    672		     uint64_t pme)
    673{
    674	uint64_t buf[KPAGEFLAGS_BATCH];
    675	uint64_t cgi[KPAGEFLAGS_BATCH];
    676	uint64_t cnt[KPAGEFLAGS_BATCH];
    677	unsigned long batch;
    678	unsigned long pages;
    679	unsigned long i;
    680
    681	/*
    682	 * kpagecgroup_read() reads only if kpagecgroup were opened, but
    683	 * /proc/kpagecgroup might even not exist, so it's better to fill
    684	 * them with zeros here.
    685	 */
    686	if (count == 1)
    687		cgi[0] = 0;
    688	else
    689		memset(cgi, 0, sizeof cgi);
    690
    691	while (count) {
    692		batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
    693		pages = kpageflags_read(buf, index, batch);
    694		if (pages == 0)
    695			break;
    696
    697		if (kpagecgroup_read(cgi, index, pages) != pages)
    698			fatal("kpagecgroup returned fewer pages than expected");
    699
    700		if (kpagecount_read(cnt, index, pages) != pages)
    701			fatal("kpagecount returned fewer pages than expected");
    702
    703		for (i = 0; i < pages; i++)
    704			add_page(voffset + i, index + i,
    705				 buf[i], cgi[i], cnt[i], pme);
    706
    707		index += pages;
    708		count -= pages;
    709	}
    710}
    711
    712static void walk_swap(unsigned long voffset, uint64_t pme)
    713{
    714	uint64_t flags = kpageflags_flags(0, pme);
    715
    716	if (!bit_mask_ok(flags))
    717		return;
    718
    719	if (opt_cgroup)
    720		return;
    721
    722	if (opt_list == 1)
    723		show_page_range(voffset, pagemap_swap_offset(pme),
    724				1, flags, 0, 0);
    725	else if (opt_list == 2)
    726		show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
    727
    728	nr_pages[hash_slot(flags)]++;
    729	total_pages++;
    730}
    731
    732#define PAGEMAP_BATCH	(64 << 10)
    733static void walk_vma(unsigned long index, unsigned long count)
    734{
    735	uint64_t buf[PAGEMAP_BATCH];
    736	unsigned long batch;
    737	unsigned long pages;
    738	unsigned long pfn;
    739	unsigned long i;
    740
    741	while (count) {
    742		batch = min_t(unsigned long, count, PAGEMAP_BATCH);
    743		pages = pagemap_read(buf, index, batch);
    744		if (pages == 0)
    745			break;
    746
    747		for (i = 0; i < pages; i++) {
    748			pfn = pagemap_pfn(buf[i]);
    749			if (pfn)
    750				walk_pfn(index + i, pfn, 1, buf[i]);
    751			if (buf[i] & PM_SWAP)
    752				walk_swap(index + i, buf[i]);
    753		}
    754
    755		index += pages;
    756		count -= pages;
    757	}
    758}
    759
    760static void walk_task(unsigned long index, unsigned long count)
    761{
    762	const unsigned long end = index + count;
    763	unsigned long start;
    764	int i = 0;
    765
    766	while (index < end) {
    767
    768		while (pg_end[i] <= index)
    769			if (++i >= nr_vmas)
    770				return;
    771		if (pg_start[i] >= end)
    772			return;
    773
    774		start = max_t(unsigned long, pg_start[i], index);
    775		index = min_t(unsigned long, pg_end[i], end);
    776
    777		assert(start < index);
    778		walk_vma(start, index - start);
    779	}
    780}
    781
    782static void add_addr_range(unsigned long offset, unsigned long size)
    783{
    784	if (nr_addr_ranges >= MAX_ADDR_RANGES)
    785		fatal("too many addr ranges\n");
    786
    787	opt_offset[nr_addr_ranges] = offset;
    788	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
    789	nr_addr_ranges++;
    790}
    791
    792static void walk_addr_ranges(void)
    793{
    794	int i;
    795
    796	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
    797
    798	if (!nr_addr_ranges)
    799		add_addr_range(0, ULONG_MAX);
    800
    801	for (i = 0; i < nr_addr_ranges; i++)
    802		if (!opt_pid)
    803			walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
    804		else
    805			walk_task(opt_offset[i], opt_size[i]);
    806
    807	if (opt_mark_idle)
    808		mark_page_idle(0);
    809
    810	close(kpageflags_fd);
    811}
    812
    813
    814/*
    815 * user interface
    816 */
    817
    818static const char *page_flag_type(uint64_t flag)
    819{
    820	if (flag & KPF_HACKERS_BITS)
    821		return "(r)";
    822	if (flag & KPF_OVERLOADED_BITS)
    823		return "(o)";
    824	return "   ";
    825}
    826
    827static void usage(void)
    828{
    829	size_t i, j;
    830
    831	printf(
    832"page-types [options]\n"
    833"            -r|--raw                   Raw mode, for kernel developers\n"
    834"            -d|--describe flags        Describe flags\n"
    835"            -a|--addr    addr-spec     Walk a range of pages\n"
    836"            -b|--bits    bits-spec     Walk pages with specified bits\n"
    837"            -c|--cgroup  path|@inode   Walk pages within memory cgroup\n"
    838"            -p|--pid     pid           Walk process address space\n"
    839"            -f|--file    filename      Walk file address space\n"
    840"            -i|--mark-idle             Mark pages idle\n"
    841"            -l|--list                  Show page details in ranges\n"
    842"            -L|--list-each             Show page details one by one\n"
    843"            -C|--list-cgroup           Show cgroup inode for pages\n"
    844"            -M|--list-mapcnt           Show page map count\n"
    845"            -N|--no-summary            Don't show summary info\n"
    846"            -X|--hwpoison              hwpoison pages\n"
    847"            -x|--unpoison              unpoison pages\n"
    848"            -F|--kpageflags filename   kpageflags file to parse\n"
    849"            -h|--help                  Show this usage message\n"
    850"flags:\n"
    851"            0x10                       bitfield format, e.g.\n"
    852"            anon                       bit-name, e.g.\n"
    853"            0x10,anon                  comma-separated list, e.g.\n"
    854"addr-spec:\n"
    855"            N                          one page at offset N (unit: pages)\n"
    856"            N+M                        pages range from N to N+M-1\n"
    857"            N,M                        pages range from N to M-1\n"
    858"            N,                         pages range from N to end\n"
    859"            ,M                         pages range from 0 to M-1\n"
    860"bits-spec:\n"
    861"            bit1,bit2                  (flags & (bit1|bit2)) != 0\n"
    862"            bit1,bit2=bit1             (flags & (bit1|bit2)) == bit1\n"
    863"            bit1,~bit2                 (flags & (bit1|bit2)) == bit1\n"
    864"            =bit1,bit2                 flags == (bit1|bit2)\n"
    865"bit-names:\n"
    866	);
    867
    868	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
    869		if (!page_flag_names[i])
    870			continue;
    871		printf("%16s%s", page_flag_names[i] + 2,
    872				 page_flag_type(1ULL << i));
    873		if (++j > 3) {
    874			j = 0;
    875			putchar('\n');
    876		}
    877	}
    878	printf("\n                                   "
    879		"(r) raw mode bits  (o) overloaded bits\n");
    880}
    881
    882static unsigned long long parse_number(const char *str)
    883{
    884	unsigned long long n;
    885
    886	n = strtoll(str, NULL, 0);
    887
    888	if (n == 0 && str[0] != '0')
    889		fatal("invalid name or number: %s\n", str);
    890
    891	return n;
    892}
    893
    894static void parse_pid(const char *str)
    895{
    896	FILE *file;
    897	char buf[5000];
    898
    899	opt_pid = parse_number(str);
    900
    901	sprintf(buf, "/proc/%d/pagemap", opt_pid);
    902	pagemap_fd = checked_open(buf, O_RDONLY);
    903
    904	sprintf(buf, "/proc/%d/maps", opt_pid);
    905	file = fopen(buf, "r");
    906	if (!file) {
    907		perror(buf);
    908		exit(EXIT_FAILURE);
    909	}
    910
    911	while (fgets(buf, sizeof(buf), file) != NULL) {
    912		unsigned long vm_start;
    913		unsigned long vm_end;
    914		unsigned long long pgoff;
    915		int major, minor;
    916		char r, w, x, s;
    917		unsigned long ino;
    918		int n;
    919
    920		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
    921			   &vm_start,
    922			   &vm_end,
    923			   &r, &w, &x, &s,
    924			   &pgoff,
    925			   &major, &minor,
    926			   &ino);
    927		if (n < 10) {
    928			fprintf(stderr, "unexpected line: %s\n", buf);
    929			continue;
    930		}
    931		pg_start[nr_vmas] = vm_start / page_size;
    932		pg_end[nr_vmas] = vm_end / page_size;
    933		if (++nr_vmas >= MAX_VMAS) {
    934			fprintf(stderr, "too many VMAs\n");
    935			break;
    936		}
    937	}
    938	fclose(file);
    939}
    940
    941static void show_file(const char *name, const struct stat *st)
    942{
    943	unsigned long long size = st->st_size;
    944	char atime[64], mtime[64];
    945	long now = time(NULL);
    946
    947	printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
    948			name, (unsigned)st->st_ino,
    949			size, (size + page_size - 1) / page_size);
    950
    951	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
    952	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
    953
    954	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
    955			mtime, now - st->st_mtime,
    956			atime, now - st->st_atime);
    957}
    958
/* Jump target armed by walk_file_range() and taken by sigbus_handler(). */
static sigjmp_buf sigbus_jmp;

/* Faulting address recorded by sigbus_handler() (NULL when no siginfo was supplied). */
static void * volatile sigbus_addr;
    962
    963static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
    964{
    965	(void)sig;
    966	(void)ucontex;
    967	sigbus_addr = info ? info->si_addr : NULL;
    968	siglongjmp(sigbus_jmp, 1);
    969}
    970
/* Installed for SIGBUS by walk_page_cache(); SA_SIGINFO selects the three-argument handler form. */
static struct sigaction sigbus_action = {
	.sa_sigaction = sigbus_handler,
	.sa_flags = SA_SIGINFO,
};
    975
    976static void walk_file_range(const char *name, int fd,
    977			    unsigned long off, unsigned long end)
    978{
    979	uint8_t vec[PAGEMAP_BATCH];
    980	uint64_t buf[PAGEMAP_BATCH], flags;
    981	uint64_t cgroup = 0;
    982	uint64_t mapcnt = 0;
    983	unsigned long nr_pages, pfn, i;
    984	ssize_t len;
    985	void *ptr;
    986	int first = 1;
    987
    988	for (; off < end; off += len) {
    989		nr_pages = (end - off + page_size - 1) / page_size;
    990		if (nr_pages > PAGEMAP_BATCH)
    991			nr_pages = PAGEMAP_BATCH;
    992		len = nr_pages * page_size;
    993
    994		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
    995		if (ptr == MAP_FAILED)
    996			fatal("mmap failed: %s", name);
    997
    998		/* determine cached pages */
    999		if (mincore(ptr, len, vec))
   1000			fatal("mincore failed: %s", name);
   1001
   1002		/* turn off readahead */
   1003		if (madvise(ptr, len, MADV_RANDOM))
   1004			fatal("madvice failed: %s", name);
   1005
   1006		if (sigsetjmp(sigbus_jmp, 1)) {
   1007			end = off + sigbus_addr ? sigbus_addr - ptr : 0;
   1008			fprintf(stderr, "got sigbus at offset %lld: %s\n",
   1009					(long long)end, name);
   1010			goto got_sigbus;
   1011		}
   1012
   1013		/* populate ptes */
   1014		for (i = 0; i < nr_pages ; i++) {
   1015			if (vec[i] & 1)
   1016				(void)*(volatile int *)(ptr + i * page_size);
   1017		}
   1018got_sigbus:
   1019
   1020		/* turn off harvesting reference bits */
   1021		if (madvise(ptr, len, MADV_SEQUENTIAL))
   1022			fatal("madvice failed: %s", name);
   1023
   1024		if (pagemap_read(buf, (unsigned long)ptr / page_size,
   1025					nr_pages) != nr_pages)
   1026			fatal("cannot read pagemap");
   1027
   1028		munmap(ptr, len);
   1029
   1030		for (i = 0; i < nr_pages; i++) {
   1031			pfn = pagemap_pfn(buf[i]);
   1032			if (!pfn)
   1033				continue;
   1034			if (!kpageflags_read(&flags, pfn, 1))
   1035				continue;
   1036			if (!kpagecgroup_read(&cgroup, pfn, 1))
   1037				fatal("kpagecgroup_read failed");
   1038			if (!kpagecount_read(&mapcnt, pfn, 1))
   1039				fatal("kpagecount_read failed");
   1040			if (first && opt_list) {
   1041				first = 0;
   1042				flush_page_range();
   1043			}
   1044			add_page(off / page_size + i, pfn,
   1045				 flags, cgroup, mapcnt, buf[i]);
   1046		}
   1047	}
   1048}
   1049
   1050static void walk_file(const char *name, const struct stat *st)
   1051{
   1052	int i;
   1053	int fd;
   1054
   1055	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
   1056
   1057	if (!nr_addr_ranges)
   1058		add_addr_range(0, st->st_size / page_size);
   1059
   1060	for (i = 0; i < nr_addr_ranges; i++)
   1061		walk_file_range(name, fd, opt_offset[i] * page_size,
   1062				(opt_offset[i] + opt_size[i]) * page_size);
   1063
   1064	close(fd);
   1065}
   1066
   1067int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
   1068{
   1069	(void)f;
   1070	switch (type) {
   1071	case FTW_F:
   1072		if (S_ISREG(st->st_mode))
   1073			walk_file(name, st);
   1074		break;
   1075	case FTW_DNR:
   1076		fprintf(stderr, "cannot read dir: %s\n", name);
   1077		break;
   1078	}
   1079	return 0;
   1080}
   1081
   1082struct stat st;
   1083
   1084static void walk_page_cache(void)
   1085{
   1086	kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
   1087	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
   1088	sigaction(SIGBUS, &sigbus_action, NULL);
   1089
   1090	if (stat(opt_file, &st))
   1091		fatal("stat failed: %s\n", opt_file);
   1092
   1093	if (S_ISREG(st.st_mode)) {
   1094		walk_file(opt_file, &st);
   1095	} else if (S_ISDIR(st.st_mode)) {
   1096		/* do not follow symlinks and mountpoints */
   1097		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
   1098			fatal("nftw failed: %s\n", opt_file);
   1099	} else
   1100		fatal("unhandled file type: %s\n", opt_file);
   1101
   1102	close(kpageflags_fd);
   1103	close(pagemap_fd);
   1104	signal(SIGBUS, SIG_DFL);
   1105}
   1106
   1107static void parse_file(const char *name)
   1108{
   1109	opt_file = name;
   1110}
   1111
   1112static void parse_cgroup(const char *path)
   1113{
   1114	if (path[0] == '@') {
   1115		opt_cgroup = parse_number(path + 1);
   1116		return;
   1117	}
   1118
   1119	struct stat st;
   1120
   1121	if (stat(path, &st))
   1122		fatal("stat failed: %s: %m\n", path);
   1123
   1124	if (!S_ISDIR(st.st_mode))
   1125		fatal("cgroup supposed to be a directory: %s\n", path);
   1126
   1127	opt_cgroup = st.st_ino;
   1128}
   1129
   1130static void parse_addr_range(const char *optarg)
   1131{
   1132	unsigned long offset;
   1133	unsigned long size;
   1134	char *p;
   1135
   1136	p = strchr(optarg, ',');
   1137	if (!p)
   1138		p = strchr(optarg, '+');
   1139
   1140	if (p == optarg) {
   1141		offset = 0;
   1142		size   = parse_number(p + 1);
   1143	} else if (p) {
   1144		offset = parse_number(optarg);
   1145		if (p[1] == '\0')
   1146			size = ULONG_MAX;
   1147		else {
   1148			size = parse_number(p + 1);
   1149			if (*p == ',') {
   1150				if (size < offset)
   1151					fatal("invalid range: %lu,%lu\n",
   1152							offset, size);
   1153				size -= offset;
   1154			}
   1155		}
   1156	} else {
   1157		offset = parse_number(optarg);
   1158		size   = 1;
   1159	}
   1160
   1161	add_addr_range(offset, size);
   1162}
   1163
   1164static void add_bits_filter(uint64_t mask, uint64_t bits)
   1165{
   1166	if (nr_bit_filters >= MAX_BIT_FILTERS)
   1167		fatal("too much bit filters\n");
   1168
   1169	opt_mask[nr_bit_filters] = mask;
   1170	opt_bits[nr_bit_filters] = bits;
   1171	nr_bit_filters++;
   1172}
   1173
   1174static uint64_t parse_flag_name(const char *str, int len)
   1175{
   1176	size_t i;
   1177
   1178	if (!*str || !len)
   1179		return 0;
   1180
   1181	if (len <= 8 && !strncmp(str, "compound", len))
   1182		return BITS_COMPOUND;
   1183
   1184	for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
   1185		if (!page_flag_names[i])
   1186			continue;
   1187		if (!strncmp(str, page_flag_names[i] + 2, len))
   1188			return 1ULL << i;
   1189	}
   1190
   1191	return parse_number(str);
   1192}
   1193
   1194static uint64_t parse_flag_names(const char *str, int all)
   1195{
   1196	const char *p    = str;
   1197	uint64_t   flags = 0;
   1198
   1199	while (1) {
   1200		if (*p == ',' || *p == '=' || *p == '\0') {
   1201			if ((*str != '~') || (*str == '~' && all && *++str))
   1202				flags |= parse_flag_name(str, p - str);
   1203			if (*p != ',')
   1204				break;
   1205			str = p + 1;
   1206		}
   1207		p++;
   1208	}
   1209
   1210	return flags;
   1211}
   1212
   1213static void parse_bits_mask(const char *optarg)
   1214{
   1215	uint64_t mask;
   1216	uint64_t bits;
   1217	const char *p;
   1218
   1219	p = strchr(optarg, '=');
   1220	if (p == optarg) {
   1221		mask = KPF_ALL_BITS;
   1222		bits = parse_flag_names(p + 1, 0);
   1223	} else if (p) {
   1224		mask = parse_flag_names(optarg, 0);
   1225		bits = parse_flag_names(p + 1, 0);
   1226	} else if (strchr(optarg, '~')) {
   1227		mask = parse_flag_names(optarg, 1);
   1228		bits = parse_flag_names(optarg, 0);
   1229	} else {
   1230		mask = parse_flag_names(optarg, 0);
   1231		bits = KPF_ALL_BITS;
   1232	}
   1233
   1234	add_bits_filter(mask, bits);
   1235}
   1236
   1237static void parse_kpageflags(const char *name)
   1238{
   1239	opt_kpageflags = name;
   1240}
   1241
   1242static void describe_flags(const char *optarg)
   1243{
   1244	uint64_t flags = parse_flag_names(optarg, 0);
   1245
   1246	printf("0x%016llx\t%s\t%s\n",
   1247		(unsigned long long)flags,
   1248		page_flag_name(flags),
   1249		page_flag_longname(flags));
   1250}
   1251
   1252static const struct option opts[] = {
   1253	{ "raw"       , 0, NULL, 'r' },
   1254	{ "pid"       , 1, NULL, 'p' },
   1255	{ "file"      , 1, NULL, 'f' },
   1256	{ "addr"      , 1, NULL, 'a' },
   1257	{ "bits"      , 1, NULL, 'b' },
   1258	{ "cgroup"    , 1, NULL, 'c' },
   1259	{ "describe"  , 1, NULL, 'd' },
   1260	{ "mark-idle" , 0, NULL, 'i' },
   1261	{ "list"      , 0, NULL, 'l' },
   1262	{ "list-each" , 0, NULL, 'L' },
   1263	{ "list-cgroup", 0, NULL, 'C' },
   1264	{ "list-mapcnt", 0, NULL, 'M' },
   1265	{ "no-summary", 0, NULL, 'N' },
   1266	{ "hwpoison"  , 0, NULL, 'X' },
   1267	{ "unpoison"  , 0, NULL, 'x' },
   1268	{ "kpageflags", 0, NULL, 'F' },
   1269	{ "help"      , 0, NULL, 'h' },
   1270	{ NULL        , 0, NULL, 0 }
   1271};
   1272
   1273int main(int argc, char *argv[])
   1274{
   1275	int c;
   1276
   1277	page_size = getpagesize();
   1278
   1279	while ((c = getopt_long(argc, argv,
   1280				"rp:f:a:b:d:c:CilLMNXxF:h",
   1281				opts, NULL)) != -1) {
   1282		switch (c) {
   1283		case 'r':
   1284			opt_raw = 1;
   1285			break;
   1286		case 'p':
   1287			parse_pid(optarg);
   1288			break;
   1289		case 'f':
   1290			parse_file(optarg);
   1291			break;
   1292		case 'a':
   1293			parse_addr_range(optarg);
   1294			break;
   1295		case 'b':
   1296			parse_bits_mask(optarg);
   1297			break;
   1298		case 'c':
   1299			parse_cgroup(optarg);
   1300			break;
   1301		case 'C':
   1302			opt_list_cgroup = 1;
   1303			break;
   1304		case 'd':
   1305			describe_flags(optarg);
   1306			exit(0);
   1307		case 'i':
   1308			opt_mark_idle = 1;
   1309			break;
   1310		case 'l':
   1311			opt_list = 1;
   1312			break;
   1313		case 'L':
   1314			opt_list = 2;
   1315			break;
   1316		case 'M':
   1317			opt_list_mapcnt = 1;
   1318			break;
   1319		case 'N':
   1320			opt_no_summary = 1;
   1321			break;
   1322		case 'X':
   1323			opt_hwpoison = 1;
   1324			prepare_hwpoison_fd();
   1325			break;
   1326		case 'x':
   1327			opt_unpoison = 1;
   1328			prepare_hwpoison_fd();
   1329			break;
   1330		case 'F':
   1331			parse_kpageflags(optarg);
   1332			break;
   1333		case 'h':
   1334			usage();
   1335			exit(0);
   1336		default:
   1337			usage();
   1338			exit(1);
   1339		}
   1340	}
   1341
   1342	if (!opt_kpageflags)
   1343		opt_kpageflags = PROC_KPAGEFLAGS;
   1344
   1345	if (opt_cgroup || opt_list_cgroup)
   1346		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
   1347
   1348	if (opt_list && opt_list_mapcnt)
   1349		kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
   1350
   1351	if (opt_mark_idle)
   1352		page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
   1353
   1354	if (opt_list && opt_pid)
   1355		printf("voffset\t");
   1356	if (opt_list && opt_file)
   1357		printf("foffset\t");
   1358	if (opt_list && opt_list_cgroup)
   1359		printf("cgroup\t");
   1360	if (opt_list && opt_list_mapcnt)
   1361		printf("map-cnt\t");
   1362
   1363	if (opt_list == 1)
   1364		printf("offset\tlen\tflags\n");
   1365	if (opt_list == 2)
   1366		printf("offset\tflags\n");
   1367
   1368	if (opt_file)
   1369		walk_page_cache();
   1370	else
   1371		walk_addr_ranges();
   1372
   1373	if (opt_list == 1)
   1374		flush_page_range();
   1375
   1376	if (opt_no_summary)
   1377		return 0;
   1378
   1379	if (opt_list)
   1380		printf("\n\n");
   1381
   1382	if (opt_file) {
   1383		show_file(opt_file, &st);
   1384		printf("\n");
   1385	}
   1386
   1387	show_summary();
   1388
   1389	if (opt_list_mapcnt)
   1390		close(kpagecount_fd);
   1391
   1392	if (page_idle_fd >= 0)
   1393		close(page_idle_fd);
   1394
   1395	return 0;
   1396}