cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

userfaultfd.c (45662B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Stress userfaultfd syscall.
      4 *
      5 *  Copyright (C) 2015  Red Hat, Inc.
      6 *
      7 * This test allocates two virtual areas and bounces the physical
      8 * memory across the two virtual areas (from area_src to area_dst)
      9 * using userfaultfd.
     10 *
     11 * There are three threads running per CPU:
     12 *
     13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
     14 *    page of the area_dst (while the physical page may still be in
     15 *    area_src), and increments a per-page counter in the same page,
     16 *    and checks its value against a verification region.
     17 *
     18 * 2) another per-CPU thread handles the userfaults generated by
     19 *    thread 1 above. userfaultfd blocking reads or poll() modes are
     20 *    exercised interleaved.
     21 *
     22 * 3) one last per-CPU thread transfers the memory in the background
     23 *    at maximum bandwidth (if not already transferred by thread
 *    2). Each cpu thread takes care of transferring a portion of the
 *    area.
     26 *
     27 * When all threads of type 3 completed the transfer, one bounce is
     28 * complete. area_src and area_dst are then swapped. All threads are
     29 * respawned and so the bounce is immediately restarted in the
     30 * opposite direction.
     31 *
     32 * per-CPU threads 1 by triggering userfaults inside
     33 * pthread_mutex_lock will also verify the atomicity of the memory
     34 * transfer (UFFDIO_COPY).
     35 */
     36
     37#define _GNU_SOURCE
     38#include <stdio.h>
     39#include <errno.h>
     40#include <unistd.h>
     41#include <stdlib.h>
     42#include <sys/types.h>
     43#include <sys/stat.h>
     44#include <fcntl.h>
     45#include <time.h>
     46#include <signal.h>
     47#include <poll.h>
     48#include <string.h>
     49#include <linux/mman.h>
     50#include <sys/mman.h>
     51#include <sys/syscall.h>
     52#include <sys/ioctl.h>
     53#include <sys/wait.h>
     54#include <pthread.h>
     55#include <linux/userfaultfd.h>
     56#include <setjmp.h>
     57#include <stdbool.h>
     58#include <assert.h>
     59#include <inttypes.h>
     60#include <stdint.h>
     61#include <sys/random.h>
     62
     63#include "../kselftest.h"
     64
     65#ifdef __NR_userfaultfd
     66
/*
 * Sizing of the run: each test area is nr_pages * page_size bytes long,
 * one set of worker threads is spawned per nr_cpus, and each background
 * thread copies a slice of nr_pages_per_cpu pages.
 */
static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;

/* Mode flags; the worker threads test them against the bits of "bounces". */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

/* Kind of memory under test; selects behaviour wherever test_type is checked. */
#define TEST_ANON	1
#define TEST_HUGETLB	2
#define TEST_SHMEM	3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
/* When set, the next successful retry-capable copy re-issues UFFDIO_COPY. */
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = true;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;

/* hugetlb only: map huge_fd MAP_SHARED instead of private anonymous memory. */
static bool map_shared;
/* File descriptor mapped by shmem_allocate_area(). */
static int shm_fd;
/* File descriptor mapped by hugetlb_allocate_area() when map_shared is set. */
static int huge_fd;
/* Expected per-page counter values, nr_pages entries. */
static unsigned long long *count_verify;
/* The userfaultfd descriptor, or -1 when none is open. */
static int uffd = -1;
/*
 * uffd_flags: result of fcntl(F_GETFD) on uffd.
 * finished:   set by stress() to make the locking threads exit.
 * pipefd:     nr_cpus pipe pairs; stress() writes to the second fd of a
 *             pair to stop the matching poll thread.
 */
static int uffd_flags, finished, *pipefd;
/* The two bounce areas and, for shared mappings, their alias mappings. */
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
/* NOTE(review): not used in the part of the file visible here. */
static char *zeropage;
/* Attributes handed to every pthread_create() in stress(). */
pthread_attr_t attr;
     98
/* Userfaultfd test statistics */
struct uffd_stats {
	/* Index of this entry; the poll thread uses it to pick its pipe. */
	int cpu;
	/* Missing faults for which copy_page() reported a successful copy. */
	unsigned long missing_faults;
	/* Faults delivered with UFFD_PAGEFAULT_FLAG_WP set. */
	unsigned long wp_faults;
	/* Faults delivered with UFFD_PAGEFAULT_FLAG_MINOR set. */
	unsigned long minor_faults;
};
    106
/*
 * pthread_mutex_t starts at page offset 0.
 * Yields a pointer to the mutex stored at the start of page ___nr of ___area.
 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 *
 * The address just past the mutex is rounded up to a multiple of
 * sizeof(unsigned long long) and returned as a volatile pointer.
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *) ((unsigned long)		\
				 ((___area) + (___nr)*page_size +	\
				  sizeof(pthread_mutex_t) +		\
				  sizeof(unsigned long long) - 1) &	\
				 ~(unsigned long)(sizeof(unsigned long long) \
						  -  1)))

/*
 * Exchange the values of two lvalues of the same type.  Both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
    124
/* Example command lines; usage() prints this text verbatim to stderr. */
const char *examples =
    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
    "./userfaultfd anon 100 99999\n\n"
    "# Run share memory test on 1GiB region with 99 bounces:\n"
    "./userfaultfd shmem 1000 99\n\n"
    "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
    "./userfaultfd hugetlb 256 50\n\n"
    "# Run the same hugetlb test but using shared file:\n"
    "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
    "# 10MiB-~6GiB 999 bounces anonymous test, "
    "continue forever unless an error triggers\n"
    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
    137
    138static void usage(void)
    139{
    140	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
    141		"[hugetlbfs_file]\n\n");
    142	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
    143		"hugetlb_shared, shmem\n\n");
    144	fprintf(stderr, "Examples:\n\n");
    145	fprintf(stderr, "%s", examples);
    146	exit(1);
    147}
    148
    149#define _err(fmt, ...)						\
    150	do {							\
    151		int ret = errno;				\
    152		fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);	\
    153		fprintf(stderr, " (errno=%d, line=%d)\n",	\
    154			ret, __LINE__);				\
    155	} while (0)
    156
    157#define err(fmt, ...)				\
    158	do {					\
    159		_err(fmt, ##__VA_ARGS__);	\
    160		exit(1);			\
    161	} while (0)
    162
    163static void uffd_stats_reset(struct uffd_stats *uffd_stats,
    164			     unsigned long n_cpus)
    165{
    166	int i;
    167
    168	for (i = 0; i < n_cpus; i++) {
    169		uffd_stats[i].cpu = i;
    170		uffd_stats[i].missing_faults = 0;
    171		uffd_stats[i].wp_faults = 0;
    172		uffd_stats[i].minor_faults = 0;
    173	}
    174}
    175
    176static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
    177{
    178	int i;
    179	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
    180
    181	for (i = 0; i < n_cpus; i++) {
    182		miss_total += stats[i].missing_faults;
    183		wp_total += stats[i].wp_faults;
    184		minor_total += stats[i].minor_faults;
    185	}
    186
    187	printf("userfaults: ");
    188	if (miss_total) {
    189		printf("%llu missing (", miss_total);
    190		for (i = 0; i < n_cpus; i++)
    191			printf("%lu+", stats[i].missing_faults);
    192		printf("\b) ");
    193	}
    194	if (wp_total) {
    195		printf("%llu wp (", wp_total);
    196		for (i = 0; i < n_cpus; i++)
    197			printf("%lu+", stats[i].wp_faults);
    198		printf("\b) ");
    199	}
    200	if (minor_total) {
    201		printf("%llu minor (", minor_total);
    202		for (i = 0; i < n_cpus; i++)
    203			printf("%lu+", stats[i].minor_faults);
    204		printf("\b)");
    205	}
    206	printf("\n");
    207}
    208
    209static void anon_release_pages(char *rel_area)
    210{
    211	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
    212		err("madvise(MADV_DONTNEED) failed");
    213}
    214
    215static void anon_allocate_area(void **alloc_area)
    216{
    217	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
    218			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    219	if (*alloc_area == MAP_FAILED)
    220		err("mmap of anonymous memory failed");
    221}
    222
/*
 * alias_mapping hook that does nothing; anon_uffd_test_ops uses it, so
 * *start is left untouched for anonymous memory.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}
    226
    227static void hugetlb_release_pages(char *rel_area)
    228{
    229	if (!map_shared) {
    230		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
    231			err("madvise(MADV_DONTNEED) failed");
    232	} else {
    233		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
    234			err("madvise(MADV_REMOVE) failed");
    235	}
    236}
    237
    238static void hugetlb_allocate_area(void **alloc_area)
    239{
    240	void *area_alias = NULL;
    241	char **alloc_area_alias;
    242
    243	if (!map_shared)
    244		*alloc_area = mmap(NULL,
    245			nr_pages * page_size,
    246			PROT_READ | PROT_WRITE,
    247			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
    248				(*alloc_area == area_src ? 0 : MAP_NORESERVE),
    249			-1,
    250			0);
    251	else
    252		*alloc_area = mmap(NULL,
    253			nr_pages * page_size,
    254			PROT_READ | PROT_WRITE,
    255			MAP_SHARED |
    256				(*alloc_area == area_src ? 0 : MAP_NORESERVE),
    257			huge_fd,
    258			*alloc_area == area_src ? 0 : nr_pages * page_size);
    259	if (*alloc_area == MAP_FAILED)
    260		err("mmap of hugetlbfs file failed");
    261
    262	if (map_shared) {
    263		area_alias = mmap(NULL,
    264			nr_pages * page_size,
    265			PROT_READ | PROT_WRITE,
    266			MAP_SHARED,
    267			huge_fd,
    268			*alloc_area == area_src ? 0 : nr_pages * page_size);
    269		if (area_alias == MAP_FAILED)
    270			err("mmap of hugetlb file alias failed");
    271	}
    272
    273	if (*alloc_area == area_src) {
    274		alloc_area_alias = &area_src_alias;
    275	} else {
    276		alloc_area_alias = &area_dst_alias;
    277	}
    278	if (area_alias)
    279		*alloc_area_alias = area_alias;
    280}
    281
    282static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
    283{
    284	if (!map_shared)
    285		return;
    286
    287	*start = (unsigned long) area_dst_alias + offset;
    288}
    289
    290static void shmem_release_pages(char *rel_area)
    291{
    292	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
    293		err("madvise(MADV_REMOVE) failed");
    294}
    295
    296static void shmem_allocate_area(void **alloc_area)
    297{
    298	void *area_alias = NULL;
    299	bool is_src = alloc_area == (void **)&area_src;
    300	unsigned long offset = is_src ? 0 : nr_pages * page_size;
    301
    302	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
    303			   MAP_SHARED, shm_fd, offset);
    304	if (*alloc_area == MAP_FAILED)
    305		err("mmap of memfd failed");
    306
    307	area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
    308			  MAP_SHARED, shm_fd, offset);
    309	if (area_alias == MAP_FAILED)
    310		err("mmap of memfd alias failed");
    311
    312	if (is_src)
    313		area_src_alias = area_alias;
    314	else
    315		area_dst_alias = area_alias;
    316}
    317
    318static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
    319{
    320	*start = (unsigned long)area_dst_alias + offset;
    321}
    322
/* Per-memory-type hooks; the active set is reached through uffd_test_ops. */
struct uffd_test_ops {
	/* Map one test area and store it in *alloc_area. */
	void (*allocate_area)(void **alloc_area);
	/* Release all pages backing the area starting at rel_area. */
	void (*release_pages)(char *rel_area);
	/* May rewrite *start to the matching address in the alias mapping. */
	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
    328
/* Hooks for anonymous memory; there is no alias mapping. */
static struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area	= anon_allocate_area,
	.release_pages	= anon_release_pages,
	.alias_mapping = noop_alias_mapping,
};

/* Hooks for shmem memory backed by shm_fd. */
static struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area	= shmem_allocate_area,
	.release_pages	= shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
};

/* Hooks for hugetlb memory, private or shared depending on map_shared. */
static struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area	= hugetlb_allocate_area,
	.release_pages	= hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
};

/* The hook set in use; it is assigned outside the code visible here. */
static struct uffd_test_ops *uffd_test_ops;
    348
    349static inline uint64_t uffd_minor_feature(void)
    350{
    351	if (test_type == TEST_HUGETLB && map_shared)
    352		return UFFD_FEATURE_MINOR_HUGETLBFS;
    353	else if (test_type == TEST_SHMEM)
    354		return UFFD_FEATURE_MINOR_SHMEM;
    355	else
    356		return 0;
    357}
    358
    359static uint64_t get_expected_ioctls(uint64_t mode)
    360{
    361	uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
    362
    363	if (test_type == TEST_HUGETLB)
    364		ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
    365
    366	if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
    367		ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
    368
    369	if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
    370		ioctls &= ~(1 << _UFFDIO_CONTINUE);
    371
    372	return ioctls;
    373}
    374
    375static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
    376{
    377	uint64_t expected = get_expected_ioctls(mode);
    378	uint64_t actual = ioctls & expected;
    379
    380	if (actual != expected) {
    381		err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
    382		    expected, actual);
    383	}
    384}
    385
    386static void userfaultfd_open(uint64_t *features)
    387{
    388	struct uffdio_api uffdio_api;
    389
    390	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
    391	if (uffd < 0)
    392		err("userfaultfd syscall not available in this kernel");
    393	uffd_flags = fcntl(uffd, F_GETFD, NULL);
    394
    395	uffdio_api.api = UFFD_API;
    396	uffdio_api.features = *features;
    397	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
    398		err("UFFDIO_API failed.\nPlease make sure to "
    399		    "run with either root or ptrace capability.");
    400	if (uffdio_api.api != UFFD_API)
    401		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
    402
    403	*features = uffdio_api.features;
    404}
    405
    406static inline void munmap_area(void **area)
    407{
    408	if (*area)
    409		if (munmap(*area, nr_pages * page_size))
    410			err("munmap");
    411
    412	*area = NULL;
    413}
    414
    415static void uffd_test_ctx_clear(void)
    416{
    417	size_t i;
    418
    419	if (pipefd) {
    420		for (i = 0; i < nr_cpus * 2; ++i) {
    421			if (close(pipefd[i]))
    422				err("close pipefd");
    423		}
    424		free(pipefd);
    425		pipefd = NULL;
    426	}
    427
    428	if (count_verify) {
    429		free(count_verify);
    430		count_verify = NULL;
    431	}
    432
    433	if (uffd != -1) {
    434		if (close(uffd))
    435			err("close uffd");
    436		uffd = -1;
    437	}
    438
    439	munmap_area((void **)&area_src);
    440	munmap_area((void **)&area_src_alias);
    441	munmap_area((void **)&area_dst);
    442	munmap_area((void **)&area_dst_alias);
    443}
    444
    445static void uffd_test_ctx_init(uint64_t features)
    446{
    447	unsigned long nr, cpu;
    448
    449	uffd_test_ctx_clear();
    450
    451	uffd_test_ops->allocate_area((void **)&area_src);
    452	uffd_test_ops->allocate_area((void **)&area_dst);
    453
    454	userfaultfd_open(&features);
    455
    456	count_verify = malloc(nr_pages * sizeof(unsigned long long));
    457	if (!count_verify)
    458		err("count_verify");
    459
    460	for (nr = 0; nr < nr_pages; nr++) {
    461		*area_mutex(area_src, nr) =
    462			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
    463		count_verify[nr] = *area_count(area_src, nr) = 1;
    464		/*
    465		 * In the transition between 255 to 256, powerpc will
    466		 * read out of order in my_bcmp and see both bytes as
    467		 * zero, so leave a placeholder below always non-zero
    468		 * after the count, to avoid my_bcmp to trigger false
    469		 * positives.
    470		 */
    471		*(area_count(area_src, nr) + 1) = 1;
    472	}
    473
    474	/*
    475	 * After initialization of area_src, we must explicitly release pages
    476	 * for area_dst to make sure it's fully empty.  Otherwise we could have
    477	 * some area_dst pages be errornously initialized with zero pages,
    478	 * hence we could hit memory corruption later in the test.
    479	 *
    480	 * One example is when THP is globally enabled, above allocate_area()
    481	 * calls could have the two areas merged into a single VMA (as they
    482	 * will have the same VMA flags so they're mergeable).  When we
    483	 * initialize the area_src above, it's possible that some part of
    484	 * area_dst could have been faulted in via one huge THP that will be
    485	 * shared between area_src and area_dst.  It could cause some of the
    486	 * area_dst won't be trapped by missing userfaults.
    487	 *
    488	 * This release_pages() will guarantee even if that happened, we'll
    489	 * proactively split the thp and drop any accidentally initialized
    490	 * pages within area_dst.
    491	 */
    492	uffd_test_ops->release_pages(area_dst);
    493
    494	pipefd = malloc(sizeof(int) * nr_cpus * 2);
    495	if (!pipefd)
    496		err("pipefd");
    497	for (cpu = 0; cpu < nr_cpus; cpu++)
    498		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
    499			err("pipe");
    500}
    501
    502static int my_bcmp(char *str1, char *str2, size_t n)
    503{
    504	unsigned long i;
    505	for (i = 0; i < n; i++)
    506		if (str1[i] != str2[i])
    507			return 1;
    508	return 0;
    509}
    510
    511static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
    512{
    513	struct uffdio_writeprotect prms;
    514
    515	/* Write protection page faults */
    516	prms.range.start = start;
    517	prms.range.len = len;
    518	/* Undo write-protect, do wakeup after that */
    519	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
    520
    521	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
    522		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
    523}
    524
    525static void continue_range(int ufd, __u64 start, __u64 len)
    526{
    527	struct uffdio_continue req;
    528	int ret;
    529
    530	req.range.start = start;
    531	req.range.len = len;
    532	req.mode = 0;
    533
    534	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
    535		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
    536		    (uint64_t)start);
    537
    538	/*
    539	 * Error handling within the kernel for continue is subtly different
    540	 * from copy or zeropage, so it may be a source of bugs. Trigger an
    541	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
    542	 */
    543	req.mapped = 0;
    544	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
    545	if (ret >= 0 || req.mapped != -EEXIST)
    546		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
    547		    ret, (int64_t) req.mapped);
    548}
    549
    550static void *locking_thread(void *arg)
    551{
    552	unsigned long cpu = (unsigned long) arg;
    553	unsigned long page_nr;
    554	unsigned long long count;
    555
    556	if (!(bounces & BOUNCE_RANDOM)) {
    557		page_nr = -bounces;
    558		if (!(bounces & BOUNCE_RACINGFAULTS))
    559			page_nr += cpu * nr_pages_per_cpu;
    560	}
    561
    562	while (!finished) {
    563		if (bounces & BOUNCE_RANDOM) {
    564			if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
    565				err("getrandom failed");
    566		} else
    567			page_nr += 1;
    568		page_nr %= nr_pages;
    569		pthread_mutex_lock(area_mutex(area_dst, page_nr));
    570		count = *area_count(area_dst, page_nr);
    571		if (count != count_verify[page_nr])
    572			err("page_nr %lu memory corruption %llu %llu",
    573			    page_nr, count, count_verify[page_nr]);
    574		count++;
    575		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
    576		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
    577	}
    578
    579	return NULL;
    580}
    581
    582static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
    583			    unsigned long offset)
    584{
    585	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
    586				     uffdio_copy->len,
    587				     offset);
    588	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
    589		/* real retval in ufdio_copy.copy */
    590		if (uffdio_copy->copy != -EEXIST)
    591			err("UFFDIO_COPY retry error: %"PRId64,
    592			    (int64_t)uffdio_copy->copy);
    593	} else {
    594		err("UFFDIO_COPY retry unexpected: %"PRId64,
    595		    (int64_t)uffdio_copy->copy);
    596	}
    597}
    598
    599static void wake_range(int ufd, unsigned long addr, unsigned long len)
    600{
    601	struct uffdio_range uffdio_wake;
    602
    603	uffdio_wake.start = addr;
    604	uffdio_wake.len = len;
    605
    606	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
    607		fprintf(stderr, "error waking %lu\n",
    608			addr), exit(1);
    609}
    610
    611static int __copy_page(int ufd, unsigned long offset, bool retry)
    612{
    613	struct uffdio_copy uffdio_copy;
    614
    615	if (offset >= nr_pages * page_size)
    616		err("unexpected offset %lu\n", offset);
    617	uffdio_copy.dst = (unsigned long) area_dst + offset;
    618	uffdio_copy.src = (unsigned long) area_src + offset;
    619	uffdio_copy.len = page_size;
    620	if (test_uffdio_wp)
    621		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
    622	else
    623		uffdio_copy.mode = 0;
    624	uffdio_copy.copy = 0;
    625	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
    626		/* real retval in ufdio_copy.copy */
    627		if (uffdio_copy.copy != -EEXIST)
    628			err("UFFDIO_COPY error: %"PRId64,
    629			    (int64_t)uffdio_copy.copy);
    630		wake_range(ufd, uffdio_copy.dst, page_size);
    631	} else if (uffdio_copy.copy != page_size) {
    632		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
    633	} else {
    634		if (test_uffdio_copy_eexist && retry) {
    635			test_uffdio_copy_eexist = false;
    636			retry_copy_page(ufd, &uffdio_copy, offset);
    637		}
    638		return 1;
    639	}
    640	return 0;
    641}
    642
/*
 * Copy the page at @offset with the retry option of __copy_page() enabled.
 * Returns whatever __copy_page() returns.
 */
static int copy_page_retry(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, true);
}
    647
/*
 * Copy the page at @offset with the retry option of __copy_page() disabled.
 * Returns whatever __copy_page() returns.
 */
static int copy_page(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, false);
}
    652
    653static int uffd_read_msg(int ufd, struct uffd_msg *msg)
    654{
    655	int ret = read(uffd, msg, sizeof(*msg));
    656
    657	if (ret != sizeof(*msg)) {
    658		if (ret < 0) {
    659			if (errno == EAGAIN || errno == EINTR)
    660				return 1;
    661			err("blocking read error");
    662		} else {
    663			err("short read");
    664		}
    665	}
    666
    667	return 0;
    668}
    669
    670static void uffd_handle_page_fault(struct uffd_msg *msg,
    671				   struct uffd_stats *stats)
    672{
    673	unsigned long offset;
    674
    675	if (msg->event != UFFD_EVENT_PAGEFAULT)
    676		err("unexpected msg event %u", msg->event);
    677
    678	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
    679		/* Write protect page faults */
    680		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
    681		stats->wp_faults++;
    682	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
    683		uint8_t *area;
    684		int b;
    685
    686		/*
    687		 * Minor page faults
    688		 *
    689		 * To prove we can modify the original range for testing
    690		 * purposes, we're going to bit flip this range before
    691		 * continuing.
    692		 *
    693		 * Note that this requires all minor page fault tests operate on
    694		 * area_dst (non-UFFD-registered) and area_dst_alias
    695		 * (UFFD-registered).
    696		 */
    697
    698		area = (uint8_t *)(area_dst +
    699				   ((char *)msg->arg.pagefault.address -
    700				    area_dst_alias));
    701		for (b = 0; b < page_size; ++b)
    702			area[b] = ~area[b];
    703		continue_range(uffd, msg->arg.pagefault.address, page_size);
    704		stats->minor_faults++;
    705	} else {
    706		/* Missing page faults */
    707		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
    708			err("unexpected write fault");
    709
    710		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
    711		offset &= ~(page_size-1);
    712
    713		if (copy_page(uffd, offset))
    714			stats->missing_faults++;
    715	}
    716}
    717
    718static void *uffd_poll_thread(void *arg)
    719{
    720	struct uffd_stats *stats = (struct uffd_stats *)arg;
    721	unsigned long cpu = stats->cpu;
    722	struct pollfd pollfd[2];
    723	struct uffd_msg msg;
    724	struct uffdio_register uffd_reg;
    725	int ret;
    726	char tmp_chr;
    727
    728	pollfd[0].fd = uffd;
    729	pollfd[0].events = POLLIN;
    730	pollfd[1].fd = pipefd[cpu*2];
    731	pollfd[1].events = POLLIN;
    732
    733	for (;;) {
    734		ret = poll(pollfd, 2, -1);
    735		if (ret <= 0) {
    736			if (errno == EINTR || errno == EAGAIN)
    737				continue;
    738			err("poll error: %d", ret);
    739		}
    740		if (pollfd[1].revents & POLLIN) {
    741			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
    742				err("read pipefd error");
    743			break;
    744		}
    745		if (!(pollfd[0].revents & POLLIN))
    746			err("pollfd[0].revents %d", pollfd[0].revents);
    747		if (uffd_read_msg(uffd, &msg))
    748			continue;
    749		switch (msg.event) {
    750		default:
    751			err("unexpected msg event %u\n", msg.event);
    752			break;
    753		case UFFD_EVENT_PAGEFAULT:
    754			uffd_handle_page_fault(&msg, stats);
    755			break;
    756		case UFFD_EVENT_FORK:
    757			close(uffd);
    758			uffd = msg.arg.fork.ufd;
    759			pollfd[0].fd = uffd;
    760			break;
    761		case UFFD_EVENT_REMOVE:
    762			uffd_reg.range.start = msg.arg.remove.start;
    763			uffd_reg.range.len = msg.arg.remove.end -
    764				msg.arg.remove.start;
    765			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
    766				err("remove failure");
    767			break;
    768		case UFFD_EVENT_REMAP:
    769			area_dst = (char *)(unsigned long)msg.arg.remap.to;
    770			break;
    771		}
    772	}
    773
    774	return NULL;
    775}
    776
/*
 * Start-up handshake for the read threads: uffd_read_thread() unlocks this
 * mutex as its first action, and stress() locks it right after creating
 * each read thread.
 */
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
    778
    779static void *uffd_read_thread(void *arg)
    780{
    781	struct uffd_stats *stats = (struct uffd_stats *)arg;
    782	struct uffd_msg msg;
    783
    784	pthread_mutex_unlock(&uffd_read_mutex);
    785	/* from here cancellation is ok */
    786
    787	for (;;) {
    788		if (uffd_read_msg(uffd, &msg))
    789			continue;
    790		uffd_handle_page_fault(&msg, stats);
    791	}
    792
    793	return NULL;
    794}
    795
    796static void *background_thread(void *arg)
    797{
    798	unsigned long cpu = (unsigned long) arg;
    799	unsigned long page_nr, start_nr, mid_nr, end_nr;
    800
    801	start_nr = cpu * nr_pages_per_cpu;
    802	end_nr = (cpu+1) * nr_pages_per_cpu;
    803	mid_nr = (start_nr + end_nr) / 2;
    804
    805	/* Copy the first half of the pages */
    806	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
    807		copy_page_retry(uffd, page_nr * page_size);
    808
    809	/*
    810	 * If we need to test uffd-wp, set it up now.  Then we'll have
    811	 * at least the first half of the pages mapped already which
    812	 * can be write-protected for testing
    813	 */
    814	if (test_uffdio_wp)
    815		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
    816			nr_pages_per_cpu * page_size, true);
    817
    818	/*
    819	 * Continue the 2nd half of the page copying, handling write
    820	 * protection faults if any
    821	 */
    822	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
    823		copy_page_retry(uffd, page_nr * page_size);
    824
    825	return NULL;
    826}
    827
    828static int stress(struct uffd_stats *uffd_stats)
    829{
    830	unsigned long cpu;
    831	pthread_t locking_threads[nr_cpus];
    832	pthread_t uffd_threads[nr_cpus];
    833	pthread_t background_threads[nr_cpus];
    834
    835	finished = 0;
    836	for (cpu = 0; cpu < nr_cpus; cpu++) {
    837		if (pthread_create(&locking_threads[cpu], &attr,
    838				   locking_thread, (void *)cpu))
    839			return 1;
    840		if (bounces & BOUNCE_POLL) {
    841			if (pthread_create(&uffd_threads[cpu], &attr,
    842					   uffd_poll_thread,
    843					   (void *)&uffd_stats[cpu]))
    844				return 1;
    845		} else {
    846			if (pthread_create(&uffd_threads[cpu], &attr,
    847					   uffd_read_thread,
    848					   (void *)&uffd_stats[cpu]))
    849				return 1;
    850			pthread_mutex_lock(&uffd_read_mutex);
    851		}
    852		if (pthread_create(&background_threads[cpu], &attr,
    853				   background_thread, (void *)cpu))
    854			return 1;
    855	}
    856	for (cpu = 0; cpu < nr_cpus; cpu++)
    857		if (pthread_join(background_threads[cpu], NULL))
    858			return 1;
    859
    860	/*
    861	 * Be strict and immediately zap area_src, the whole area has
    862	 * been transferred already by the background treads. The
    863	 * area_src could then be faulted in in a racy way by still
    864	 * running uffdio_threads reading zeropages after we zapped
    865	 * area_src (but they're guaranteed to get -EEXIST from
    866	 * UFFDIO_COPY without writing zero pages into area_dst
    867	 * because the background threads already completed).
    868	 */
    869	uffd_test_ops->release_pages(area_src);
    870
    871	finished = 1;
    872	for (cpu = 0; cpu < nr_cpus; cpu++)
    873		if (pthread_join(locking_threads[cpu], NULL))
    874			return 1;
    875
    876	for (cpu = 0; cpu < nr_cpus; cpu++) {
    877		char c;
    878		if (bounces & BOUNCE_POLL) {
    879			if (write(pipefd[cpu*2+1], &c, 1) != 1)
    880				err("pipefd write error");
    881			if (pthread_join(uffd_threads[cpu],
    882					 (void *)&uffd_stats[cpu]))
    883				return 1;
    884		} else {
    885			if (pthread_cancel(uffd_threads[cpu]))
    886				return 1;
    887			if (pthread_join(uffd_threads[cpu], NULL))
    888				return 1;
    889		}
    890	}
    891
    892	return 0;
    893}
    894
/*
 * Jump target for the SIGBUS handler: sighndl() jumps through sigbuf when
 * it is non-NULL; faulting_process() points it at jbuf for the signal tests.
 */
sigjmp_buf jbuf, *sigbuf;
    896
    897static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
    898{
    899	if (sig == SIGBUS) {
    900		if (sigbuf)
    901			siglongjmp(*sigbuf, 1);
    902		abort();
    903	}
    904}
    905
    906/*
    907 * For non-cooperative userfaultfd test we fork() a process that will
    908 * generate pagefaults, will mremap the area monitored by the
    909 * userfaultfd and at last this process will release the monitored
    910 * area.
    911 * For the anonymous and shared memory the area is divided into two
    912 * parts, the first part is accessed before mremap, and the second
    913 * part is accessed after mremap. Since hugetlbfs does not support
    914 * mremap, the entire monitored area is accessed in a single pass for
    915 * HUGETLB_TEST.
    916 * The release of the pages currently generates event for shmem and
    917 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
    918 * for hugetlb.
    919 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
    920 * monitored area, generate pagefaults and test that signal is delivered.
    921 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
    922 * test robustness use case - we release monitored area, fork a process
    923 * that will generate pagefaults and verify signal is generated.
    924 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
    925 * feature. Using monitor thread, verify no userfault events are generated.
    926 */
    927static int faulting_process(int signal_test)
    928{
    929	unsigned long nr;
    930	unsigned long long count;
    931	unsigned long split_nr_pages;
    932	unsigned long lastnr;
    933	struct sigaction act;
    934	unsigned long signalled = 0;
    935
    936	split_nr_pages = (nr_pages + 1) / 2;
    937
    938	if (signal_test) {
    939		sigbuf = &jbuf;
    940		memset(&act, 0, sizeof(act));
    941		act.sa_sigaction = sighndl;
    942		act.sa_flags = SA_SIGINFO;
    943		if (sigaction(SIGBUS, &act, 0))
    944			err("sigaction");
    945		lastnr = (unsigned long)-1;
    946	}
    947
    948	for (nr = 0; nr < split_nr_pages; nr++) {
    949		int steps = 1;
    950		unsigned long offset = nr * page_size;
    951
    952		if (signal_test) {
    953			if (sigsetjmp(*sigbuf, 1) != 0) {
    954				if (steps == 1 && nr == lastnr)
    955					err("Signal repeated");
    956
    957				lastnr = nr;
    958				if (signal_test == 1) {
    959					if (steps == 1) {
    960						/* This is a MISSING request */
    961						steps++;
    962						if (copy_page(uffd, offset))
    963							signalled++;
    964					} else {
    965						/* This is a WP request */
    966						assert(steps == 2);
    967						wp_range(uffd,
    968							 (__u64)area_dst +
    969							 offset,
    970							 page_size, false);
    971					}
    972				} else {
    973					signalled++;
    974					continue;
    975				}
    976			}
    977		}
    978
    979		count = *area_count(area_dst, nr);
    980		if (count != count_verify[nr])
    981			err("nr %lu memory corruption %llu %llu\n",
    982			    nr, count, count_verify[nr]);
    983		/*
    984		 * Trigger write protection if there is by writing
    985		 * the same value back.
    986		 */
    987		*area_count(area_dst, nr) = count;
    988	}
    989
    990	if (signal_test)
    991		return signalled != split_nr_pages;
    992
    993	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
    994			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
    995	if (area_dst == MAP_FAILED)
    996		err("mremap");
    997	/* Reset area_src since we just clobbered it */
    998	area_src = NULL;
    999
   1000	for (; nr < nr_pages; nr++) {
   1001		count = *area_count(area_dst, nr);
   1002		if (count != count_verify[nr]) {
   1003			err("nr %lu memory corruption %llu %llu\n",
   1004			    nr, count, count_verify[nr]);
   1005		}
   1006		/*
   1007		 * Trigger write protection if there is by writing
   1008		 * the same value back.
   1009		 */
   1010		*area_count(area_dst, nr) = count;
   1011	}
   1012
   1013	uffd_test_ops->release_pages(area_dst);
   1014
   1015	for (nr = 0; nr < nr_pages; nr++)
   1016		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
   1017			err("nr %lu is not zero", nr);
   1018
   1019	return 0;
   1020}
   1021
   1022static void retry_uffdio_zeropage(int ufd,
   1023				  struct uffdio_zeropage *uffdio_zeropage,
   1024				  unsigned long offset)
   1025{
   1026	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
   1027				     uffdio_zeropage->range.len,
   1028				     offset);
   1029	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
   1030		if (uffdio_zeropage->zeropage != -EEXIST)
   1031			err("UFFDIO_ZEROPAGE error: %"PRId64,
   1032			    (int64_t)uffdio_zeropage->zeropage);
   1033	} else {
   1034		err("UFFDIO_ZEROPAGE error: %"PRId64,
   1035		    (int64_t)uffdio_zeropage->zeropage);
   1036	}
   1037}
   1038
   1039static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
   1040{
   1041	struct uffdio_zeropage uffdio_zeropage;
   1042	int ret;
   1043	bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
   1044	__s64 res;
   1045
   1046	if (offset >= nr_pages * page_size)
   1047		err("unexpected offset %lu", offset);
   1048	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
   1049	uffdio_zeropage.range.len = page_size;
   1050	uffdio_zeropage.mode = 0;
   1051	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
   1052	res = uffdio_zeropage.zeropage;
   1053	if (ret) {
   1054		/* real retval in ufdio_zeropage.zeropage */
   1055		if (has_zeropage)
   1056			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
   1057		else if (res != -EINVAL)
   1058			err("UFFDIO_ZEROPAGE not -EINVAL");
   1059	} else if (has_zeropage) {
   1060		if (res != page_size) {
   1061			err("UFFDIO_ZEROPAGE unexpected size");
   1062		} else {
   1063			if (test_uffdio_zeropage_eexist && retry) {
   1064				test_uffdio_zeropage_eexist = false;
   1065				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
   1066						      offset);
   1067			}
   1068			return 1;
   1069		}
   1070	} else
   1071		err("UFFDIO_ZEROPAGE succeeded");
   1072
   1073	return 0;
   1074}
   1075
   1076static int uffdio_zeropage(int ufd, unsigned long offset)
   1077{
   1078	return __uffdio_zeropage(ufd, offset, false);
   1079}
   1080
   1081/* exercise UFFDIO_ZEROPAGE */
   1082static int userfaultfd_zeropage_test(void)
   1083{
   1084	struct uffdio_register uffdio_register;
   1085
   1086	printf("testing UFFDIO_ZEROPAGE: ");
   1087	fflush(stdout);
   1088
   1089	uffd_test_ctx_init(0);
   1090
   1091	uffdio_register.range.start = (unsigned long) area_dst;
   1092	uffdio_register.range.len = nr_pages * page_size;
   1093	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
   1094	if (test_uffdio_wp)
   1095		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
   1096	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
   1097		err("register failure");
   1098
   1099	assert_expected_ioctls_present(
   1100		uffdio_register.mode, uffdio_register.ioctls);
   1101
   1102	if (uffdio_zeropage(uffd, 0))
   1103		if (my_bcmp(area_dst, zeropage, page_size))
   1104			err("zeropage is not zero");
   1105
   1106	printf("done.\n");
   1107	return 0;
   1108}
   1109
   1110static int userfaultfd_events_test(void)
   1111{
   1112	struct uffdio_register uffdio_register;
   1113	pthread_t uffd_mon;
   1114	int err, features;
   1115	pid_t pid;
   1116	char c;
   1117	struct uffd_stats stats = { 0 };
   1118
   1119	printf("testing events (fork, remap, remove): ");
   1120	fflush(stdout);
   1121
   1122	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
   1123		UFFD_FEATURE_EVENT_REMOVE;
   1124	uffd_test_ctx_init(features);
   1125
   1126	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
   1127
   1128	uffdio_register.range.start = (unsigned long) area_dst;
   1129	uffdio_register.range.len = nr_pages * page_size;
   1130	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
   1131	if (test_uffdio_wp)
   1132		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
   1133	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
   1134		err("register failure");
   1135
   1136	assert_expected_ioctls_present(
   1137		uffdio_register.mode, uffdio_register.ioctls);
   1138
   1139	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
   1140		err("uffd_poll_thread create");
   1141
   1142	pid = fork();
   1143	if (pid < 0)
   1144		err("fork");
   1145
   1146	if (!pid)
   1147		exit(faulting_process(0));
   1148
   1149	waitpid(pid, &err, 0);
   1150	if (err)
   1151		err("faulting process failed");
   1152	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
   1153		err("pipe write");
   1154	if (pthread_join(uffd_mon, NULL))
   1155		return 1;
   1156
   1157	uffd_stats_report(&stats, 1);
   1158
   1159	return stats.missing_faults != nr_pages;
   1160}
   1161
   1162static int userfaultfd_sig_test(void)
   1163{
   1164	struct uffdio_register uffdio_register;
   1165	unsigned long userfaults;
   1166	pthread_t uffd_mon;
   1167	int err, features;
   1168	pid_t pid;
   1169	char c;
   1170	struct uffd_stats stats = { 0 };
   1171
   1172	printf("testing signal delivery: ");
   1173	fflush(stdout);
   1174
   1175	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
   1176	uffd_test_ctx_init(features);
   1177
   1178	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
   1179
   1180	uffdio_register.range.start = (unsigned long) area_dst;
   1181	uffdio_register.range.len = nr_pages * page_size;
   1182	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
   1183	if (test_uffdio_wp)
   1184		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
   1185	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
   1186		err("register failure");
   1187
   1188	assert_expected_ioctls_present(
   1189		uffdio_register.mode, uffdio_register.ioctls);
   1190
   1191	if (faulting_process(1))
   1192		err("faulting process failed");
   1193
   1194	uffd_test_ops->release_pages(area_dst);
   1195
   1196	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
   1197		err("uffd_poll_thread create");
   1198
   1199	pid = fork();
   1200	if (pid < 0)
   1201		err("fork");
   1202
   1203	if (!pid)
   1204		exit(faulting_process(2));
   1205
   1206	waitpid(pid, &err, 0);
   1207	if (err)
   1208		err("faulting process failed");
   1209	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
   1210		err("pipe write");
   1211	if (pthread_join(uffd_mon, (void **)&userfaults))
   1212		return 1;
   1213
   1214	printf("done.\n");
   1215	if (userfaults)
   1216		err("Signal test failed, userfaults: %ld", userfaults);
   1217
   1218	return userfaults != 0;
   1219}
   1220
   1221static int userfaultfd_minor_test(void)
   1222{
   1223	struct uffdio_register uffdio_register;
   1224	unsigned long p;
   1225	pthread_t uffd_mon;
   1226	uint8_t expected_byte;
   1227	void *expected_page;
   1228	char c;
   1229	struct uffd_stats stats = { 0 };
   1230
   1231	if (!test_uffdio_minor)
   1232		return 0;
   1233
   1234	printf("testing minor faults: ");
   1235	fflush(stdout);
   1236
   1237	uffd_test_ctx_init(uffd_minor_feature());
   1238
   1239	uffdio_register.range.start = (unsigned long)area_dst_alias;
   1240	uffdio_register.range.len = nr_pages * page_size;
   1241	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
   1242	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
   1243		err("register failure");
   1244
   1245	assert_expected_ioctls_present(
   1246		uffdio_register.mode, uffdio_register.ioctls);
   1247
   1248	/*
   1249	 * After registering with UFFD, populate the non-UFFD-registered side of
   1250	 * the shared mapping. This should *not* trigger any UFFD minor faults.
   1251	 */
   1252	for (p = 0; p < nr_pages; ++p) {
   1253		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
   1254		       page_size);
   1255	}
   1256
   1257	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
   1258		err("uffd_poll_thread create");
   1259
   1260	/*
   1261	 * Read each of the pages back using the UFFD-registered mapping. We
   1262	 * expect that the first time we touch a page, it will result in a minor
   1263	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
   1264	 * page's contents, and then issuing a CONTINUE ioctl.
   1265	 */
   1266
   1267	if (posix_memalign(&expected_page, page_size, page_size))
   1268		err("out of memory");
   1269
   1270	for (p = 0; p < nr_pages; ++p) {
   1271		expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
   1272		memset(expected_page, expected_byte, page_size);
   1273		if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
   1274			    page_size))
   1275			err("unexpected page contents after minor fault");
   1276	}
   1277
   1278	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
   1279		err("pipe write");
   1280	if (pthread_join(uffd_mon, NULL))
   1281		return 1;
   1282
   1283	uffd_stats_report(&stats, 1);
   1284
   1285	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
   1286}
   1287
   1288#define BIT_ULL(nr)                   (1ULL << (nr))
   1289#define PM_SOFT_DIRTY                 BIT_ULL(55)
   1290#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
   1291#define PM_UFFD_WP                    BIT_ULL(57)
   1292#define PM_FILE                       BIT_ULL(61)
   1293#define PM_SWAP                       BIT_ULL(62)
   1294#define PM_PRESENT                    BIT_ULL(63)
   1295
   1296static int pagemap_open(void)
   1297{
   1298	int fd = open("/proc/self/pagemap", O_RDONLY);
   1299
   1300	if (fd < 0)
   1301		err("open pagemap");
   1302
   1303	return fd;
   1304}
   1305
   1306static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
   1307{
   1308	uint64_t value;
   1309	int ret;
   1310
   1311	ret = pread(fd, &value, sizeof(uint64_t),
   1312		    ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
   1313	if (ret != sizeof(uint64_t))
   1314		err("pread() on pagemap failed");
   1315
   1316	return value;
   1317}
   1318
   1319/* This macro let __LINE__ works in err() */
   1320#define  pagemap_check_wp(value, wp) do {				\
   1321		if (!!(value & PM_UFFD_WP) != wp)			\
   1322			err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
   1323	} while (0)
   1324
   1325static int pagemap_test_fork(bool present)
   1326{
   1327	pid_t child = fork();
   1328	uint64_t value;
   1329	int fd, result;
   1330
   1331	if (!child) {
   1332		/* Open the pagemap fd of the child itself */
   1333		fd = pagemap_open();
   1334		value = pagemap_read_vaddr(fd, area_dst);
   1335		/*
   1336		 * After fork() uffd-wp bit should be gone as long as we're
   1337		 * without UFFD_FEATURE_EVENT_FORK
   1338		 */
   1339		pagemap_check_wp(value, false);
   1340		/* Succeed */
   1341		exit(0);
   1342	}
   1343	waitpid(child, &result, 0);
   1344	return result;
   1345}
   1346
   1347static void userfaultfd_pagemap_test(unsigned int test_pgsize)
   1348{
   1349	struct uffdio_register uffdio_register;
   1350	int pagemap_fd;
   1351	uint64_t value;
   1352
   1353	/* Pagemap tests uffd-wp only */
   1354	if (!test_uffdio_wp)
   1355		return;
   1356
   1357	/* Not enough memory to test this page size */
   1358	if (test_pgsize > nr_pages * page_size)
   1359		return;
   1360
   1361	printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
   1362	/* Flush so it doesn't flush twice in parent/child later */
   1363	fflush(stdout);
   1364
   1365	uffd_test_ctx_init(0);
   1366
   1367	if (test_pgsize > page_size) {
   1368		/* This is a thp test */
   1369		if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
   1370			err("madvise(MADV_HUGEPAGE) failed");
   1371	} else if (test_pgsize == page_size) {
   1372		/* This is normal page test; force no thp */
   1373		if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
   1374			err("madvise(MADV_NOHUGEPAGE) failed");
   1375	}
   1376
   1377	uffdio_register.range.start = (unsigned long) area_dst;
   1378	uffdio_register.range.len = nr_pages * page_size;
   1379	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
   1380	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
   1381		err("register failed");
   1382
   1383	pagemap_fd = pagemap_open();
   1384
   1385	/* Touch the page */
   1386	*area_dst = 1;
   1387	wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
   1388	value = pagemap_read_vaddr(pagemap_fd, area_dst);
   1389	pagemap_check_wp(value, true);
   1390	/* Make sure uffd-wp bit dropped when fork */
   1391	if (pagemap_test_fork(true))
   1392		err("Detected stall uffd-wp bit in child");
   1393
   1394	/* Exclusive required or PAGEOUT won't work */
   1395	if (!(value & PM_MMAP_EXCLUSIVE))
   1396		err("multiple mapping detected: 0x%"PRIx64, value);
   1397
   1398	if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
   1399		err("madvise(MADV_PAGEOUT) failed");
   1400
   1401	/* Uffd-wp should persist even swapped out */
   1402	value = pagemap_read_vaddr(pagemap_fd, area_dst);
   1403	pagemap_check_wp(value, true);
   1404	/* Make sure uffd-wp bit dropped when fork */
   1405	if (pagemap_test_fork(false))
   1406		err("Detected stall uffd-wp bit in child");
   1407
   1408	/* Unprotect; this tests swap pte modifications */
   1409	wp_range(uffd, (uint64_t)area_dst, page_size, false);
   1410	value = pagemap_read_vaddr(pagemap_fd, area_dst);
   1411	pagemap_check_wp(value, false);
   1412
   1413	/* Fault in the page from disk */
   1414	*area_dst = 2;
   1415	value = pagemap_read_vaddr(pagemap_fd, area_dst);
   1416	pagemap_check_wp(value, false);
   1417
   1418	close(pagemap_fd);
   1419	printf("done\n");
   1420}
   1421
   1422static int userfaultfd_stress(void)
   1423{
   1424	void *area;
   1425	unsigned long nr;
   1426	struct uffdio_register uffdio_register;
   1427	struct uffd_stats uffd_stats[nr_cpus];
   1428
   1429	uffd_test_ctx_init(0);
   1430
   1431	if (posix_memalign(&area, page_size, page_size))
   1432		err("out of memory");
   1433	zeropage = area;
   1434	bzero(zeropage, page_size);
   1435
   1436	pthread_mutex_lock(&uffd_read_mutex);
   1437
   1438	pthread_attr_init(&attr);
   1439	pthread_attr_setstacksize(&attr, 16*1024*1024);
   1440
   1441	while (bounces--) {
   1442		printf("bounces: %d, mode:", bounces);
   1443		if (bounces & BOUNCE_RANDOM)
   1444			printf(" rnd");
   1445		if (bounces & BOUNCE_RACINGFAULTS)
   1446			printf(" racing");
   1447		if (bounces & BOUNCE_VERIFY)
   1448			printf(" ver");
   1449		if (bounces & BOUNCE_POLL)
   1450			printf(" poll");
   1451		else
   1452			printf(" read");
   1453		printf(", ");
   1454		fflush(stdout);
   1455
   1456		if (bounces & BOUNCE_POLL)
   1457			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
   1458		else
   1459			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
   1460
   1461		/* register */
   1462		uffdio_register.range.start = (unsigned long) area_dst;
   1463		uffdio_register.range.len = nr_pages * page_size;
   1464		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
   1465		if (test_uffdio_wp)
   1466			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
   1467		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
   1468			err("register failure");
   1469		assert_expected_ioctls_present(
   1470			uffdio_register.mode, uffdio_register.ioctls);
   1471
   1472		if (area_dst_alias) {
   1473			uffdio_register.range.start = (unsigned long)
   1474				area_dst_alias;
   1475			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
   1476				err("register failure alias");
   1477		}
   1478
   1479		/*
   1480		 * The madvise done previously isn't enough: some
   1481		 * uffd_thread could have read userfaults (one of
   1482		 * those already resolved by the background thread)
   1483		 * and it may be in the process of calling
   1484		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
   1485		 * area_src and it would map a zero page in it (of
   1486		 * course such a UFFDIO_COPY is perfectly safe as it'd
   1487		 * return -EEXIST). The problem comes at the next
   1488		 * bounce though: that racing UFFDIO_COPY would
   1489		 * generate zeropages in the area_src, so invalidating
   1490		 * the previous MADV_DONTNEED. Without this additional
   1491		 * MADV_DONTNEED those zeropages leftovers in the
   1492		 * area_src would lead to -EEXIST failure during the
   1493		 * next bounce, effectively leaving a zeropage in the
   1494		 * area_dst.
   1495		 *
   1496		 * Try to comment this out madvise to see the memory
   1497		 * corruption being caught pretty quick.
   1498		 *
   1499		 * khugepaged is also inhibited to collapse THP after
   1500		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
   1501		 * required to MADV_DONTNEED here.
   1502		 */
   1503		uffd_test_ops->release_pages(area_dst);
   1504
   1505		uffd_stats_reset(uffd_stats, nr_cpus);
   1506
   1507		/* bounce pass */
   1508		if (stress(uffd_stats))
   1509			return 1;
   1510
   1511		/* Clear all the write protections if there is any */
   1512		if (test_uffdio_wp)
   1513			wp_range(uffd, (unsigned long)area_dst,
   1514				 nr_pages * page_size, false);
   1515
   1516		/* unregister */
   1517		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
   1518			err("unregister failure");
   1519		if (area_dst_alias) {
   1520			uffdio_register.range.start = (unsigned long) area_dst;
   1521			if (ioctl(uffd, UFFDIO_UNREGISTER,
   1522				  &uffdio_register.range))
   1523				err("unregister failure alias");
   1524		}
   1525
   1526		/* verification */
   1527		if (bounces & BOUNCE_VERIFY)
   1528			for (nr = 0; nr < nr_pages; nr++)
   1529				if (*area_count(area_dst, nr) != count_verify[nr])
   1530					err("error area_count %llu %llu %lu\n",
   1531					    *area_count(area_src, nr),
   1532					    count_verify[nr], nr);
   1533
   1534		/* prepare next bounce */
   1535		swap(area_src, area_dst);
   1536
   1537		swap(area_src_alias, area_dst_alias);
   1538
   1539		uffd_stats_report(uffd_stats, nr_cpus);
   1540	}
   1541
   1542	if (test_type == TEST_ANON) {
   1543		/*
   1544		 * shmem/hugetlb won't be able to run since they have different
   1545		 * behavior on fork() (file-backed memory normally drops ptes
   1546		 * directly when fork), meanwhile the pagemap test will verify
   1547		 * pgtable entry of fork()ed child.
   1548		 */
   1549		userfaultfd_pagemap_test(page_size);
   1550		/*
   1551		 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
   1552		 * currently the only one that supports uffd-wp
   1553		 */
   1554		userfaultfd_pagemap_test(page_size * 512);
   1555	}
   1556
   1557	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
   1558		|| userfaultfd_events_test() || userfaultfd_minor_test();
   1559}
   1560
   1561/*
   1562 * Copied from mlock2-tests.c
   1563 */
   1564unsigned long default_huge_page_size(void)
   1565{
   1566	unsigned long hps = 0;
   1567	char *line = NULL;
   1568	size_t linelen = 0;
   1569	FILE *f = fopen("/proc/meminfo", "r");
   1570
   1571	if (!f)
   1572		return 0;
   1573	while (getline(&line, &linelen, f) > 0) {
   1574		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
   1575			hps <<= 10;
   1576			break;
   1577		}
   1578	}
   1579
   1580	free(line);
   1581	fclose(f);
   1582	return hps;
   1583}
   1584
   1585static void set_test_type(const char *type)
   1586{
   1587	uint64_t features = UFFD_API_FEATURES;
   1588
   1589	if (!strcmp(type, "anon")) {
   1590		test_type = TEST_ANON;
   1591		uffd_test_ops = &anon_uffd_test_ops;
   1592	} else if (!strcmp(type, "hugetlb")) {
   1593		test_type = TEST_HUGETLB;
   1594		uffd_test_ops = &hugetlb_uffd_test_ops;
   1595	} else if (!strcmp(type, "hugetlb_shared")) {
   1596		map_shared = true;
   1597		test_type = TEST_HUGETLB;
   1598		uffd_test_ops = &hugetlb_uffd_test_ops;
   1599		/* Minor faults require shared hugetlb; only enable here. */
   1600		test_uffdio_minor = true;
   1601	} else if (!strcmp(type, "shmem")) {
   1602		map_shared = true;
   1603		test_type = TEST_SHMEM;
   1604		uffd_test_ops = &shmem_uffd_test_ops;
   1605		test_uffdio_minor = true;
   1606	} else {
   1607		err("Unknown test type: %s", type);
   1608	}
   1609
   1610	if (test_type == TEST_HUGETLB)
   1611		page_size = default_huge_page_size();
   1612	else
   1613		page_size = sysconf(_SC_PAGE_SIZE);
   1614
   1615	if (!page_size)
   1616		err("Unable to determine page size");
   1617	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
   1618	    > page_size)
   1619		err("Impossible to run this test");
   1620
   1621	/*
   1622	 * Whether we can test certain features depends not just on test type,
   1623	 * but also on whether or not this particular kernel supports the
   1624	 * feature.
   1625	 */
   1626
   1627	userfaultfd_open(&features);
   1628
   1629	test_uffdio_wp = test_uffdio_wp &&
   1630		(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
   1631	test_uffdio_minor = test_uffdio_minor &&
   1632		(features & uffd_minor_feature());
   1633
   1634	close(uffd);
   1635	uffd = -1;
   1636}
   1637
   1638static void sigalrm(int sig)
   1639{
   1640	if (sig != SIGALRM)
   1641		abort();
   1642	test_uffdio_copy_eexist = true;
   1643	test_uffdio_zeropage_eexist = true;
   1644	alarm(ALARM_INTERVAL_SECS);
   1645}
   1646
   1647int main(int argc, char **argv)
   1648{
   1649	if (argc < 4)
   1650		usage();
   1651
   1652	if (signal(SIGALRM, sigalrm) == SIG_ERR)
   1653		err("failed to arm SIGALRM");
   1654	alarm(ALARM_INTERVAL_SECS);
   1655
   1656	set_test_type(argv[1]);
   1657
   1658	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
   1659	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
   1660		nr_cpus;
   1661	if (!nr_pages_per_cpu) {
   1662		_err("invalid MiB");
   1663		usage();
   1664	}
   1665
   1666	bounces = atoi(argv[3]);
   1667	if (bounces <= 0) {
   1668		_err("invalid bounces");
   1669		usage();
   1670	}
   1671	nr_pages = nr_pages_per_cpu * nr_cpus;
   1672
   1673	if (test_type == TEST_HUGETLB && map_shared) {
   1674		if (argc < 5)
   1675			usage();
   1676		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
   1677		if (huge_fd < 0)
   1678			err("Open of %s failed", argv[4]);
   1679		if (ftruncate(huge_fd, 0))
   1680			err("ftruncate %s to size 0 failed", argv[4]);
   1681	} else if (test_type == TEST_SHMEM) {
   1682		shm_fd = memfd_create(argv[0], 0);
   1683		if (shm_fd < 0)
   1684			err("memfd_create");
   1685		if (ftruncate(shm_fd, nr_pages * page_size * 2))
   1686			err("ftruncate");
   1687		if (fallocate(shm_fd,
   1688			      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
   1689			      nr_pages * page_size * 2))
   1690			err("fallocate");
   1691	}
   1692	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
   1693	       nr_pages, nr_pages_per_cpu);
   1694	return userfaultfd_stress();
   1695}
   1696
   1697#else /* __NR_userfaultfd */
   1698
   1699#warning "missing __NR_userfaultfd definition"
   1700
   1701int main(void)
   1702{
   1703	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
   1704	return KSFT_SKIP;
   1705}
   1706
   1707#endif /* __NR_userfaultfd */