cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

protection_keys.c (43446B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
      4 *
      5 * There are examples in here of:
      6 *  * how to set protection keys on memory
      7 *  * how to set/clear bits in pkey registers (the rights register)
      8 *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
      9 *    information from the siginfo
     10 *
     11 * Things to add:
     12 *	make sure KSM and KSM COW breaking works
     13 *	prefault pages in at malloc, or not
     14 *	protect MPX bounds tables with protection keys?
     15 *	make sure VMA splitting/merging is working correctly
     16 *	OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
     17 *	look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
     18 *	do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
     19 *
     20 * Compile like this:
     21 *	gcc      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
     22 *	gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
     23 */
     24#define _GNU_SOURCE
     25#define __SANE_USERSPACE_TYPES__
     26#include <errno.h>
     27#include <linux/futex.h>
     28#include <time.h>
     29#include <sys/time.h>
     30#include <sys/syscall.h>
     31#include <string.h>
     32#include <stdio.h>
     33#include <stdint.h>
     34#include <stdbool.h>
     35#include <signal.h>
     36#include <assert.h>
     37#include <stdlib.h>
     38#include <ucontext.h>
     39#include <sys/mman.h>
     40#include <sys/types.h>
     41#include <sys/wait.h>
     42#include <sys/stat.h>
     43#include <fcntl.h>
     44#include <unistd.h>
     45#include <sys/ptrace.h>
     46#include <setjmp.h>
     47
     48#include "pkey-helpers.h"
     49
/* Test-progress counters: the iteration number starts at 1. */
int iteration_nr = 1;
int test_nr;

/*
 * Software copy of the pkey register.  The helpers in this file update it
 * whenever they change the rights of a key.
 */
u64 shadow_pkey_reg;
/* Set to 1 for the duration of signal_handler() and sig_chld(). */
int dprint_in_signal;
/*
 * Scratch buffer of DPRINT_IN_SIGNAL_BUF_SIZE bytes; presumably consumed by
 * the dprintf*() helpers while dprint_in_signal is set (confirm in
 * pkey-helpers.h).
 */
char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
     56
     57void cat_into_file(char *str, char *file)
     58{
     59	int fd = open(file, O_RDWR);
     60	int ret;
     61
     62	dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
     63	/*
     64	 * these need to be raw because they are called under
     65	 * pkey_assert()
     66	 */
     67	if (fd < 0) {
     68		fprintf(stderr, "error opening '%s'\n", str);
     69		perror("error: ");
     70		exit(__LINE__);
     71	}
     72
     73	ret = write(fd, str, strlen(str));
     74	if (ret != strlen(str)) {
     75		perror("write to file failed");
     76		fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
     77		exit(__LINE__);
     78	}
     79	close(fd);
     80}
     81
     82#if CONTROL_TRACING > 0
     83static int warned_tracing;
     84int tracing_root_ok(void)
     85{
     86	if (geteuid() != 0) {
     87		if (!warned_tracing)
     88			fprintf(stderr, "WARNING: not run as root, "
     89					"can not do tracing control\n");
     90		warned_tracing = 1;
     91		return 0;
     92	}
     93	return 1;
     94}
     95#endif
     96
     97void tracing_on(void)
     98{
     99#if CONTROL_TRACING > 0
    100#define TRACEDIR "/sys/kernel/debug/tracing"
    101	char pidstr[32];
    102
    103	if (!tracing_root_ok())
    104		return;
    105
    106	sprintf(pidstr, "%d", getpid());
    107	cat_into_file("0", TRACEDIR "/tracing_on");
    108	cat_into_file("\n", TRACEDIR "/trace");
    109	if (1) {
    110		cat_into_file("function_graph", TRACEDIR "/current_tracer");
    111		cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
    112	} else {
    113		cat_into_file("nop", TRACEDIR "/current_tracer");
    114	}
    115	cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
    116	cat_into_file("1", TRACEDIR "/tracing_on");
    117	dprintf1("enabled tracing\n");
    118#endif
    119}
    120
    121void tracing_off(void)
    122{
    123#if CONTROL_TRACING > 0
    124	if (!tracing_root_ok())
    125		return;
    126	cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
    127#endif
    128}
    129
/*
 * Last-chance hook: announces itself on stderr, turns tracing off and,
 * when SLEEP_ON_ABORT is defined, sleeps for that many seconds.
 */
void abort_hooks(void)
{
	fprintf(stderr, "running %s()...\n", __func__);
	tracing_off();
#ifdef SLEEP_ON_ABORT
	sleep(SLEEP_ON_ABORT);
#endif
}
    138
/*
 * This attempts to have roughly a page of instructions followed by a few
 * instructions that do a write, and another page of instructions.  That
 * way, we are pretty sure that the write is in the second page of
 * instructions and has at least a page of padding behind it.
 *
 * *That* lets us be sure to madvise() away the write instruction, which
 * will then fault, which makes sure that the fault code handles
 * execute-only memory properly.
 *
 * @write_to_me: location that receives the store.  The value stored is the
 *               source line number of the store and carries no meaning.
 */
#ifdef __powerpc64__
/* This way, both 4K and 64K alignment are maintained */
__attribute__((__aligned__(65536)))
#else
__attribute__((__aligned__(PAGE_SIZE)))
#endif
void lots_o_noops_around_write(int *write_to_me)
{
	dprintf3("running %s()\n", __func__);
	__page_o_noops();
	/* Assume this happens in the second page of instructions: */
	*write_to_me = __LINE__;
	/* pad out by another page: */
	__page_o_noops();
	dprintf3("%s() done\n", __func__);
}
    165
    166void dump_mem(void *dumpme, int len_bytes)
    167{
    168	char *c = (void *)dumpme;
    169	int i;
    170
    171	for (i = 0; i < len_bytes; i += sizeof(u64)) {
    172		u64 *ptr = (u64 *)(c + i);
    173		dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
    174	}
    175}
    176
    177static u32 hw_pkey_get(int pkey, unsigned long flags)
    178{
    179	u64 pkey_reg = __read_pkey_reg();
    180
    181	dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
    182			__func__, pkey, flags, 0, 0);
    183	dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
    184
    185	return (u32) get_pkey_bits(pkey_reg, pkey);
    186}
    187
    188static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
    189{
    190	u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
    191	u64 old_pkey_reg = __read_pkey_reg();
    192	u64 new_pkey_reg;
    193
    194	/* make sure that 'rights' only contains the bits we expect: */
    195	assert(!(rights & ~mask));
    196
    197	/* modify bits accordingly in old pkey_reg and assign it */
    198	new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
    199
    200	__write_pkey_reg(new_pkey_reg);
    201
    202	dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
    203		" pkey_reg now: %016llx old_pkey_reg: %016llx\n",
    204		__func__, pkey, rights, flags, 0, __read_pkey_reg(),
    205		old_pkey_reg);
    206	return 0;
    207}
    208
    209void pkey_disable_set(int pkey, int flags)
    210{
    211	unsigned long syscall_flags = 0;
    212	int ret;
    213	int pkey_rights;
    214	u64 orig_pkey_reg = read_pkey_reg();
    215
    216	dprintf1("START->%s(%d, 0x%x)\n", __func__,
    217		pkey, flags);
    218	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
    219
    220	pkey_rights = hw_pkey_get(pkey, syscall_flags);
    221
    222	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
    223			pkey, pkey, pkey_rights);
    224
    225	pkey_assert(pkey_rights >= 0);
    226
    227	pkey_rights |= flags;
    228
    229	ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
    230	assert(!ret);
    231	/* pkey_reg and flags have the same format */
    232	shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
    233	dprintf1("%s(%d) shadow: 0x%016llx\n",
    234		__func__, pkey, shadow_pkey_reg);
    235
    236	pkey_assert(ret >= 0);
    237
    238	pkey_rights = hw_pkey_get(pkey, syscall_flags);
    239	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
    240			pkey, pkey, pkey_rights);
    241
    242	dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
    243		__func__, pkey, read_pkey_reg());
    244	if (flags)
    245		pkey_assert(read_pkey_reg() >= orig_pkey_reg);
    246	dprintf1("END<---%s(%d, 0x%x)\n", __func__,
    247		pkey, flags);
    248}
    249
    250void pkey_disable_clear(int pkey, int flags)
    251{
    252	unsigned long syscall_flags = 0;
    253	int ret;
    254	int pkey_rights = hw_pkey_get(pkey, syscall_flags);
    255	u64 orig_pkey_reg = read_pkey_reg();
    256
    257	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
    258
    259	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
    260			pkey, pkey, pkey_rights);
    261	pkey_assert(pkey_rights >= 0);
    262
    263	pkey_rights &= ~flags;
    264
    265	ret = hw_pkey_set(pkey, pkey_rights, 0);
    266	shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
    267	pkey_assert(ret >= 0);
    268
    269	pkey_rights = hw_pkey_get(pkey, syscall_flags);
    270	dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
    271			pkey, pkey, pkey_rights);
    272
    273	dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
    274			pkey, read_pkey_reg());
    275	if (flags)
    276		assert(read_pkey_reg() <= orig_pkey_reg);
    277}
    278
/* Clear PKEY_DISABLE_WRITE for 'pkey' (see pkey_disable_clear()). */
void pkey_write_allow(int pkey)
{
	pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
}
/* Set PKEY_DISABLE_WRITE for 'pkey' (see pkey_disable_set()). */
void pkey_write_deny(int pkey)
{
	pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
}
/* Clear PKEY_DISABLE_ACCESS for 'pkey' (see pkey_disable_clear()). */
void pkey_access_allow(int pkey)
{
	pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
}
/* Set PKEY_DISABLE_ACCESS for 'pkey' (see pkey_disable_set()). */
void pkey_access_deny(int pkey)
{
	pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
}
    295
    296/* Failed address bound checks: */
    297#ifndef SEGV_BNDERR
    298# define SEGV_BNDERR		3
    299#endif
    300
    301#ifndef SEGV_PKUERR
    302# define SEGV_PKUERR		4
    303#endif
    304
    305static char *si_code_str(int si_code)
    306{
    307	if (si_code == SEGV_MAPERR)
    308		return "SEGV_MAPERR";
    309	if (si_code == SEGV_ACCERR)
    310		return "SEGV_ACCERR";
    311	if (si_code == SEGV_BNDERR)
    312		return "SEGV_BNDERR";
    313	if (si_code == SEGV_PKUERR)
    314		return "SEGV_PKUERR";
    315	return "UNKNOWN";
    316}
    317
/* Number of faults signal_handler() has processed to completion. */
int pkey_faults;
/* pkey reported by the most recent fault; -1 when none is pending. */
int last_si_pkey = -1;
/*
 * SA_SIGINFO-style handler installed by setup_sigsegv_handler().
 *
 * It logs the fault, exits the process with status 4 for si_codes that are
 * not protection-key faults, records the faulting pkey in last_si_pkey,
 * makes the faulting access possible again (x86: zero the saved pkey
 * register in the signal frame; powerpc64: re-allow access for the key)
 * and finally bumps pkey_faults.
 *
 * @signum:    signal number (not used in the body)
 * @si:        siginfo of the fault
 * @vucontext: ucontext_t of the interrupted code
 */
void signal_handler(int signum, siginfo_t *si, void *vucontext)
{
	ucontext_t *uctxt = vucontext;
	int trapno;
	unsigned long ip;
	char *fpregs;
#if defined(__i386__) || defined(__x86_64__) /* arch */
	u32 *pkey_reg_ptr;
	int pkey_reg_offset;
#endif /* arch */
	u64 siginfo_pkey;
	u32 *si_pkey_ptr;

	/* flag that debug prints now happen in signal context */
	dprint_in_signal = 1;
	dprintf1(">>>>===============SIGSEGV============================\n");
	dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
			__func__, __LINE__,
			__read_pkey_reg(), shadow_pkey_reg);

	/* pull trap number, instruction pointer and FP state out of the frame */
	trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
	ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
	fpregs = (char *) uctxt->uc_mcontext.fpregs;

	dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
			__func__, trapno, ip, si_code_str(si->si_code),
			si->si_code);

#if defined(__i386__) || defined(__x86_64__) /* arch */
#ifdef __i386__
	/*
	 * 32-bit has some extra padding so that userspace can tell whether
	 * the XSTATE header is present in addition to the "legacy" FPU
	 * state.  We just assume that it is here.
	 */
	fpregs += 0x70;
#endif /* i386 */
	/* locate the saved pkey register inside the FP/xsave area */
	pkey_reg_offset = pkey_reg_xstate_offset();
	pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);

	/*
	 * If we got a PKEY fault, we *HAVE* to have at least one bit set in
	 * here.
	 */
	dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
	if (DEBUG_LEVEL > 4)
		dump_mem(pkey_reg_ptr - 128, 256);
	pkey_assert(*pkey_reg_ptr);
#endif /* arch */

	dprintf1("siginfo: %p\n", si);
	dprintf1(" fpregs: %p\n", fpregs);

	/* anything that is not a pkey fault is unexpected: bail out */
	if ((si->si_code == SEGV_MAPERR) ||
	    (si->si_code == SEGV_ACCERR) ||
	    (si->si_code == SEGV_BNDERR)) {
		printf("non-PK si_code, exiting...\n");
		exit(4);
	}

	/* extract the faulting pkey from siginfo and remember it */
	si_pkey_ptr = siginfo_get_pkey_ptr(si);
	dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
	dump_mem((u8 *)si_pkey_ptr - 8, 24);
	siginfo_pkey = *si_pkey_ptr;
	pkey_assert(siginfo_pkey < NR_PKEYS);
	last_si_pkey = siginfo_pkey;

	/*
	 * need __read_pkey_reg() version so we do not do shadow_pkey_reg
	 * checking
	 */
	dprintf1("signal pkey_reg from  pkey_reg: %016llx\n",
			__read_pkey_reg());
	dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
#if defined(__i386__) || defined(__x86_64__) /* arch */
	dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
	/* NOTE(review): 64-bit store through a pointer declared u32 * */
	*(u64 *)pkey_reg_ptr = 0x00000000;
	dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
#elif defined(__powerpc64__) /* arch */
	/* restore access and let the faulting instruction continue */
	pkey_access_allow(siginfo_pkey);
#endif /* arch */
	pkey_faults++;
	dprintf1("<<<<==================================================\n");
	dprint_in_signal = 0;
}
    405
    406int wait_all_children(void)
    407{
    408	int status;
    409	return waitpid(-1, &status, 0);
    410}
    411
/*
 * SIGCHLD handler (installed by setup_handlers()): only logs the pid and
 * the signal number, with dprint_in_signal raised around the print.
 */
void sig_chld(int x)
{
	dprint_in_signal = 1;
	dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
	dprint_in_signal = 0;
}
    418
    419void setup_sigsegv_handler(void)
    420{
    421	int r, rs;
    422	struct sigaction newact;
    423	struct sigaction oldact;
    424
    425	/* #PF is mapped to sigsegv */
    426	int signum  = SIGSEGV;
    427
    428	newact.sa_handler = 0;
    429	newact.sa_sigaction = signal_handler;
    430
    431	/*sigset_t - signals to block while in the handler */
    432	/* get the old signal mask. */
    433	rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
    434	pkey_assert(rs == 0);
    435
    436	/* call sa_sigaction, not sa_handler*/
    437	newact.sa_flags = SA_SIGINFO;
    438
    439	newact.sa_restorer = 0;  /* void(*)(), obsolete */
    440	r = sigaction(signum, &newact, &oldact);
    441	r = sigaction(SIGALRM, &newact, &oldact);
    442	pkey_assert(r == 0);
    443}
    444
/*
 * Install every signal handler the tests rely on: sig_chld() for SIGCHLD
 * plus whatever setup_sigsegv_handler() installs.
 */
void setup_handlers(void)
{
	signal(SIGCHLD, &sig_chld);
	setup_sigsegv_handler();
}
    450
    451pid_t fork_lazy_child(void)
    452{
    453	pid_t forkret;
    454
    455	forkret = fork();
    456	pkey_assert(forkret >= 0);
    457	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
    458
    459	if (!forkret) {
    460		/* in the child */
    461		while (1) {
    462			dprintf1("child sleeping...\n");
    463			sleep(30);
    464		}
    465	}
    466	return forkret;
    467}
    468
    469int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
    470		unsigned long pkey)
    471{
    472	int sret;
    473
    474	dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
    475			ptr, size, orig_prot, pkey);
    476
    477	errno = 0;
    478	sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
    479	if (errno) {
    480		dprintf2("SYS_mprotect_key sret: %d\n", sret);
    481		dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
    482		dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
    483		if (DEBUG_LEVEL >= 2)
    484			perror("SYS_mprotect_pkey");
    485	}
    486	return sret;
    487}
    488
    489int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
    490{
    491	int ret = syscall(SYS_pkey_alloc, flags, init_val);
    492	dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
    493			__func__, flags, init_val, ret, errno);
    494	return ret;
    495}
    496
    497int alloc_pkey(void)
    498{
    499	int ret;
    500	unsigned long init_val = 0x0;
    501
    502	dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
    503			__func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
    504	ret = sys_pkey_alloc(0, init_val);
    505	/*
    506	 * pkey_alloc() sets PKEY register, so we need to reflect it in
    507	 * shadow_pkey_reg:
    508	 */
    509	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    510			" shadow: 0x%016llx\n",
    511			__func__, __LINE__, ret, __read_pkey_reg(),
    512			shadow_pkey_reg);
    513	if (ret > 0) {
    514		/* clear both the bits: */
    515		shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
    516						~PKEY_MASK);
    517		dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    518				" shadow: 0x%016llx\n",
    519				__func__,
    520				__LINE__, ret, __read_pkey_reg(),
    521				shadow_pkey_reg);
    522		/*
    523		 * move the new state in from init_val
    524		 * (remember, we cheated and init_val == pkey_reg format)
    525		 */
    526		shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
    527						init_val);
    528	}
    529	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    530			" shadow: 0x%016llx\n",
    531			__func__, __LINE__, ret, __read_pkey_reg(),
    532			shadow_pkey_reg);
    533	dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
    534	/* for shadow checking: */
    535	read_pkey_reg();
    536	dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    537		 " shadow: 0x%016llx\n",
    538		__func__, __LINE__, ret, __read_pkey_reg(),
    539		shadow_pkey_reg);
    540	return ret;
    541}
    542
    543int sys_pkey_free(unsigned long pkey)
    544{
    545	int ret = syscall(SYS_pkey_free, pkey);
    546	dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
    547	return ret;
    548}
    549
    550/*
    551 * I had a bug where pkey bits could be set by mprotect() but
    552 * not cleared.  This ensures we get lots of random bit sets
    553 * and clears on the vma and pte pkey bits.
    554 */
    555int alloc_random_pkey(void)
    556{
    557	int max_nr_pkey_allocs;
    558	int ret;
    559	int i;
    560	int alloced_pkeys[NR_PKEYS];
    561	int nr_alloced = 0;
    562	int random_index;
    563	memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
    564
    565	/* allocate every possible key and make a note of which ones we got */
    566	max_nr_pkey_allocs = NR_PKEYS;
    567	for (i = 0; i < max_nr_pkey_allocs; i++) {
    568		int new_pkey = alloc_pkey();
    569		if (new_pkey < 0)
    570			break;
    571		alloced_pkeys[nr_alloced++] = new_pkey;
    572	}
    573
    574	pkey_assert(nr_alloced > 0);
    575	/* select a random one out of the allocated ones */
    576	random_index = rand() % nr_alloced;
    577	ret = alloced_pkeys[random_index];
    578	/* now zero it out so we don't free it next */
    579	alloced_pkeys[random_index] = 0;
    580
    581	/* go through the allocated ones that we did not want and free them */
    582	for (i = 0; i < nr_alloced; i++) {
    583		int free_ret;
    584		if (!alloced_pkeys[i])
    585			continue;
    586		free_ret = sys_pkey_free(alloced_pkeys[i]);
    587		pkey_assert(!free_ret);
    588	}
    589	dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    590			 " shadow: 0x%016llx\n", __func__,
    591			__LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
    592	return ret;
    593}
    594
    595int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
    596		unsigned long pkey)
    597{
    598	int nr_iterations = random() % 100;
    599	int ret;
    600
    601	while (0) {
    602		int rpkey = alloc_random_pkey();
    603		ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
    604		dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
    605				ptr, size, orig_prot, pkey, ret);
    606		if (nr_iterations-- < 0)
    607			break;
    608
    609		dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    610			" shadow: 0x%016llx\n",
    611			__func__, __LINE__, ret, __read_pkey_reg(),
    612			shadow_pkey_reg);
    613		sys_pkey_free(rpkey);
    614		dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    615			" shadow: 0x%016llx\n",
    616			__func__, __LINE__, ret, __read_pkey_reg(),
    617			shadow_pkey_reg);
    618	}
    619	pkey_assert(pkey < NR_PKEYS);
    620
    621	ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
    622	dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
    623			ptr, size, orig_prot, pkey, ret);
    624	pkey_assert(!ret);
    625	dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
    626			" shadow: 0x%016llx\n", __func__,
    627			__LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
    628	return ret;
    629}
    630
    631struct pkey_malloc_record {
    632	void *ptr;
    633	long size;
    634	int prot;
    635};
    636struct pkey_malloc_record *pkey_malloc_records;
    637struct pkey_malloc_record *pkey_last_malloc_record;
    638long nr_pkey_malloc_records;
    639void record_pkey_malloc(void *ptr, long size, int prot)
    640{
    641	long i;
    642	struct pkey_malloc_record *rec = NULL;
    643
    644	for (i = 0; i < nr_pkey_malloc_records; i++) {
    645		rec = &pkey_malloc_records[i];
    646		/* find a free record */
    647		if (rec)
    648			break;
    649	}
    650	if (!rec) {
    651		/* every record is full */
    652		size_t old_nr_records = nr_pkey_malloc_records;
    653		size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
    654		size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
    655		dprintf2("new_nr_records: %zd\n", new_nr_records);
    656		dprintf2("new_size: %zd\n", new_size);
    657		pkey_malloc_records = realloc(pkey_malloc_records, new_size);
    658		pkey_assert(pkey_malloc_records != NULL);
    659		rec = &pkey_malloc_records[nr_pkey_malloc_records];
    660		/*
    661		 * realloc() does not initialize memory, so zero it from
    662		 * the first new record all the way to the end.
    663		 */
    664		for (i = 0; i < new_nr_records - old_nr_records; i++)
    665			memset(rec + i, 0, sizeof(*rec));
    666	}
    667	dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
    668		(int)(rec - pkey_malloc_records), rec, ptr, size);
    669	rec->ptr = ptr;
    670	rec->size = size;
    671	rec->prot = prot;
    672	pkey_last_malloc_record = rec;
    673	nr_pkey_malloc_records++;
    674}
    675
    676void free_pkey_malloc(void *ptr)
    677{
    678	long i;
    679	int ret;
    680	dprintf3("%s(%p)\n", __func__, ptr);
    681	for (i = 0; i < nr_pkey_malloc_records; i++) {
    682		struct pkey_malloc_record *rec = &pkey_malloc_records[i];
    683		dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
    684				ptr, i, rec, rec->ptr, rec->size);
    685		if ((ptr <  rec->ptr) ||
    686		    (ptr >= rec->ptr + rec->size))
    687			continue;
    688
    689		dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
    690				ptr, i, rec, rec->ptr, rec->size);
    691		nr_pkey_malloc_records--;
    692		ret = munmap(rec->ptr, rec->size);
    693		dprintf3("munmap ret: %d\n", ret);
    694		pkey_assert(!ret);
    695		dprintf3("clearing rec->ptr, rec: %p\n", rec);
    696		rec->ptr = NULL;
    697		dprintf3("done clearing rec->ptr, rec: %p\n", rec);
    698		return;
    699	}
    700	pkey_assert(false);
    701}
    702
    703
    704void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
    705{
    706	void *ptr;
    707	int ret;
    708
    709	read_pkey_reg();
    710	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
    711			size, prot, pkey);
    712	pkey_assert(pkey < NR_PKEYS);
    713	ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
    714	pkey_assert(ptr != (void *)-1);
    715	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
    716	pkey_assert(!ret);
    717	record_pkey_malloc(ptr, size, prot);
    718	read_pkey_reg();
    719
    720	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
    721	return ptr;
    722}
    723
    724void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
    725{
    726	int ret;
    727	void *ptr;
    728
    729	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
    730			size, prot, pkey);
    731	/*
    732	 * Guarantee we can fit at least one huge page in the resulting
    733	 * allocation by allocating space for 2:
    734	 */
    735	size = ALIGN_UP(size, HPAGE_SIZE * 2);
    736	ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
    737	pkey_assert(ptr != (void *)-1);
    738	record_pkey_malloc(ptr, size, prot);
    739	mprotect_pkey(ptr, size, prot, pkey);
    740
    741	dprintf1("unaligned ptr: %p\n", ptr);
    742	ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
    743	dprintf1("  aligned ptr: %p\n", ptr);
    744	ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
    745	dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
    746	ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
    747	dprintf1("MADV_WILLNEED ret: %d\n", ret);
    748	memset(ptr, 0, HPAGE_SIZE);
    749
    750	dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
    751	return ptr;
    752}
    753
    754int hugetlb_setup_ok;
    755#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
    756#define GET_NR_HUGE_PAGES 10
    757void setup_hugetlbfs(void)
    758{
    759	int err;
    760	int fd;
    761	char buf[256];
    762	long hpagesz_kb;
    763	long hpagesz_mb;
    764
    765	if (geteuid() != 0) {
    766		fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
    767		return;
    768	}
    769
    770	cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
    771
    772	/*
    773	 * Now go make sure that we got the pages and that they
    774	 * are PMD-level pages. Someone might have made PUD-level
    775	 * pages the default.
    776	 */
    777	hpagesz_kb = HPAGE_SIZE / 1024;
    778	hpagesz_mb = hpagesz_kb / 1024;
    779	sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
    780	fd = open(buf, O_RDONLY);
    781	if (fd < 0) {
    782		fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
    783			hpagesz_mb, strerror(errno));
    784		return;
    785	}
    786
    787	/* -1 to guarantee leaving the trailing \0 */
    788	err = read(fd, buf, sizeof(buf)-1);
    789	close(fd);
    790	if (err <= 0) {
    791		fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
    792			hpagesz_mb, strerror(errno));
    793		return;
    794	}
    795
    796	if (atoi(buf) != GET_NR_HUGE_PAGES) {
    797		fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
    798			hpagesz_mb, buf, GET_NR_HUGE_PAGES);
    799		return;
    800	}
    801
    802	hugetlb_setup_ok = 1;
    803}
    804
    805void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
    806{
    807	void *ptr;
    808	int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
    809
    810	if (!hugetlb_setup_ok)
    811		return PTR_ERR_ENOTSUP;
    812
    813	dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
    814	size = ALIGN_UP(size, HPAGE_SIZE * 2);
    815	pkey_assert(pkey < NR_PKEYS);
    816	ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
    817	pkey_assert(ptr != (void *)-1);
    818	mprotect_pkey(ptr, size, prot, pkey);
    819
    820	record_pkey_malloc(ptr, size, prot);
    821
    822	dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
    823	return ptr;
    824}
    825
    826void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
    827{
    828	void *ptr;
    829	int fd;
    830
    831	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
    832			size, prot, pkey);
    833	pkey_assert(pkey < NR_PKEYS);
    834	fd = open("/dax/foo", O_RDWR);
    835	pkey_assert(fd >= 0);
    836
    837	ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
    838	pkey_assert(ptr != (void *)-1);
    839
    840	mprotect_pkey(ptr, size, prot, pkey);
    841
    842	record_pkey_malloc(ptr, size, prot);
    843
    844	dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
    845	close(fd);
    846	return ptr;
    847}
    848
/*
 * Table of allocator back ends that malloc_pkey() cycles through.  Every
 * entry takes (size, prot, pkey) and returns the mapping.
 */
void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {

	malloc_pkey_with_mprotect,
	malloc_pkey_with_mprotect_subpage,
	malloc_pkey_anon_huge,
	malloc_pkey_hugetlb
/* can not do direct with the pkey_mprotect() API:
	malloc_pkey_mmap_direct,
	malloc_pkey_mmap_dax,
*/
};
    860
    861void *malloc_pkey(long size, int prot, u16 pkey)
    862{
    863	void *ret;
    864	static int malloc_type;
    865	int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
    866
    867	pkey_assert(pkey < NR_PKEYS);
    868
    869	while (1) {
    870		pkey_assert(malloc_type < nr_malloc_types);
    871
    872		ret = pkey_malloc[malloc_type](size, prot, pkey);
    873		pkey_assert(ret != (void *)-1);
    874
    875		malloc_type++;
    876		if (malloc_type >= nr_malloc_types)
    877			malloc_type = (random()%nr_malloc_types);
    878
    879		/* try again if the malloc_type we tried is unsupported */
    880		if (ret == PTR_ERR_ENOTSUP)
    881			continue;
    882
    883		break;
    884	}
    885
    886	dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
    887			size, prot, pkey, ret);
    888	return ret;
    889}
    890
    891int last_pkey_faults;
    892#define UNKNOWN_PKEY -2
    893void expected_pkey_fault(int pkey)
    894{
    895	dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
    896			__func__, last_pkey_faults, pkey_faults);
    897	dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
    898	pkey_assert(last_pkey_faults + 1 == pkey_faults);
    899
    900       /*
    901	* For exec-only memory, we do not know the pkey in
    902	* advance, so skip this check.
    903	*/
    904	if (pkey != UNKNOWN_PKEY)
    905		pkey_assert(last_si_pkey == pkey);
    906
    907#if defined(__i386__) || defined(__x86_64__) /* arch */
    908	/*
    909	 * The signal handler shold have cleared out PKEY register to let the
    910	 * test program continue.  We now have to restore it.
    911	 */
    912	if (__read_pkey_reg() != 0)
    913#else /* arch */
    914	if (__read_pkey_reg() != shadow_pkey_reg)
    915#endif /* arch */
    916		pkey_assert(0);
    917
    918	__write_pkey_reg(shadow_pkey_reg);
    919	dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
    920		       "nuked it\n", __func__, shadow_pkey_reg);
    921	last_pkey_faults = pkey_faults;
    922	last_si_pkey = -1;
    923}
    924
/*
 * Assert that no pkey fault happened since the last accounting.  When one
 * did, 'msg' is logged before the assertion fires.
 */
#define do_not_expect_pkey_fault(msg)	do {			\
	if (last_pkey_faults != pkey_faults)			\
		dprintf0("unexpected PKey fault: %s\n", msg);	\
	pkey_assert(last_pkey_faults == pkey_faults);		\
} while (0)
    930
    931int test_fds[10] = { -1 };
    932int nr_test_fds;
    933void __save_test_fd(int fd)
    934{
    935	pkey_assert(fd >= 0);
    936	pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
    937	test_fds[nr_test_fds] = fd;
    938	nr_test_fds++;
    939}
    940
    941int get_test_read_fd(void)
    942{
    943	int test_fd = open("/etc/passwd", O_RDONLY);
    944	__save_test_fd(test_fd);
    945	return test_fd;
    946}
    947
    948void close_test_fds(void)
    949{
    950	int i;
    951
    952	for (i = 0; i < nr_test_fds; i++) {
    953		if (test_fds[i] < 0)
    954			continue;
    955		close(test_fds[i]);
    956		test_fds[i] = -1;
    957	}
    958	nr_test_fds = 0;
    959}
    960
/* Compiler-only barrier: an empty asm statement with a "memory" clobber. */
#define barrier() __asm__ __volatile__("": : :"memory")
/*
 * Load and return the int at 'ptr'.  The function is noinline and the load
 * sits behind a barrier() so the compiler keeps it.
 */
__attribute__((noinline)) int read_ptr(int *ptr)
{
	/*
	 * Keep GCC from optimizing this away somehow
	 */
	barrier();
	return *ptr;
}
    970
    971void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
    972{
    973	int i, err;
    974	int max_nr_pkey_allocs;
    975	int alloced_pkeys[NR_PKEYS];
    976	int nr_alloced = 0;
    977	long size;
    978
    979	pkey_assert(pkey_last_malloc_record);
    980	size = pkey_last_malloc_record->size;
    981	/*
    982	 * This is a bit of a hack.  But mprotect() requires
    983	 * huge-page-aligned sizes when operating on hugetlbfs.
    984	 * So, make sure that we use something that's a multiple
    985	 * of a huge page when we can.
    986	 */
    987	if (size >= HPAGE_SIZE)
    988		size = HPAGE_SIZE;
    989
    990	/* allocate every possible key and make sure key-0 never got allocated */
    991	max_nr_pkey_allocs = NR_PKEYS;
    992	for (i = 0; i < max_nr_pkey_allocs; i++) {
    993		int new_pkey = alloc_pkey();
    994		pkey_assert(new_pkey != 0);
    995
    996		if (new_pkey < 0)
    997			break;
    998		alloced_pkeys[nr_alloced++] = new_pkey;
    999	}
   1000	/* free all the allocated keys */
   1001	for (i = 0; i < nr_alloced; i++) {
   1002		int free_ret;
   1003
   1004		if (!alloced_pkeys[i])
   1005			continue;
   1006		free_ret = sys_pkey_free(alloced_pkeys[i]);
   1007		pkey_assert(!free_ret);
   1008	}
   1009
   1010	/* attach key-0 in various modes */
   1011	err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
   1012	pkey_assert(!err);
   1013	err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
   1014	pkey_assert(!err);
   1015	err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
   1016	pkey_assert(!err);
   1017	err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
   1018	pkey_assert(!err);
   1019	err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
   1020	pkey_assert(!err);
   1021}
   1022
   1023void test_read_of_write_disabled_region(int *ptr, u16 pkey)
   1024{
   1025	int ptr_contents;
   1026
   1027	dprintf1("disabling write access to PKEY[1], doing read\n");
   1028	pkey_write_deny(pkey);
   1029	ptr_contents = read_ptr(ptr);
   1030	dprintf1("*ptr: %d\n", ptr_contents);
   1031	dprintf1("\n");
   1032}
   1033void test_read_of_access_disabled_region(int *ptr, u16 pkey)
   1034{
   1035	int ptr_contents;
   1036
   1037	dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
   1038	read_pkey_reg();
   1039	pkey_access_deny(pkey);
   1040	ptr_contents = read_ptr(ptr);
   1041	dprintf1("*ptr: %d\n", ptr_contents);
   1042	expected_pkey_fault(pkey);
   1043}
   1044
   1045void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
   1046		u16 pkey)
   1047{
   1048	int ptr_contents;
   1049
   1050	dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
   1051				pkey, ptr);
   1052	ptr_contents = read_ptr(ptr);
   1053	dprintf1("reading ptr before disabling the read : %d\n",
   1054			ptr_contents);
   1055	read_pkey_reg();
   1056	pkey_access_deny(pkey);
   1057	ptr_contents = read_ptr(ptr);
   1058	dprintf1("*ptr: %d\n", ptr_contents);
   1059	expected_pkey_fault(pkey);
   1060}
   1061
   1062void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
   1063		u16 pkey)
   1064{
   1065	*ptr = __LINE__;
   1066	dprintf1("disabling write access; after accessing the page, "
   1067		"to PKEY[%02d], doing write\n", pkey);
   1068	pkey_write_deny(pkey);
   1069	*ptr = __LINE__;
   1070	expected_pkey_fault(pkey);
   1071}
   1072
   1073void test_write_of_write_disabled_region(int *ptr, u16 pkey)
   1074{
   1075	dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
   1076	pkey_write_deny(pkey);
   1077	*ptr = __LINE__;
   1078	expected_pkey_fault(pkey);
   1079}
   1080void test_write_of_access_disabled_region(int *ptr, u16 pkey)
   1081{
   1082	dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
   1083	pkey_access_deny(pkey);
   1084	*ptr = __LINE__;
   1085	expected_pkey_fault(pkey);
   1086}
   1087
   1088void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
   1089			u16 pkey)
   1090{
   1091	*ptr = __LINE__;
   1092	dprintf1("disabling access; after accessing the page, "
   1093		" to PKEY[%02d], doing write\n", pkey);
   1094	pkey_access_deny(pkey);
   1095	*ptr = __LINE__;
   1096	expected_pkey_fault(pkey);
   1097}
   1098
   1099void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
   1100{
   1101	int ret;
   1102	int test_fd = get_test_read_fd();
   1103
   1104	dprintf1("disabling access to PKEY[%02d], "
   1105		 "having kernel read() to buffer\n", pkey);
   1106	pkey_access_deny(pkey);
   1107	ret = read(test_fd, ptr, 1);
   1108	dprintf1("read ret: %d\n", ret);
   1109	pkey_assert(ret);
   1110}
   1111void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
   1112{
   1113	int ret;
   1114	int test_fd = get_test_read_fd();
   1115
   1116	pkey_write_deny(pkey);
   1117	ret = read(test_fd, ptr, 100);
   1118	dprintf1("read ret: %d\n", ret);
   1119	if (ret < 0 && (DEBUG_LEVEL > 0))
   1120		perror("verbose read result (OK for this to be bad)");
   1121	pkey_assert(ret);
   1122}
   1123
   1124void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
   1125{
   1126	int pipe_ret, vmsplice_ret;
   1127	struct iovec iov;
   1128	int pipe_fds[2];
   1129
   1130	pipe_ret = pipe(pipe_fds);
   1131
   1132	pkey_assert(pipe_ret == 0);
   1133	dprintf1("disabling access to PKEY[%02d], "
   1134		 "having kernel vmsplice from buffer\n", pkey);
   1135	pkey_access_deny(pkey);
   1136	iov.iov_base = ptr;
   1137	iov.iov_len = PAGE_SIZE;
   1138	vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
   1139	dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
   1140	pkey_assert(vmsplice_ret == -1);
   1141
   1142	close(pipe_fds[0]);
   1143	close(pipe_fds[1]);
   1144}
   1145
   1146void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
   1147{
   1148	int ignored = 0xdada;
   1149	int futex_ret;
   1150	int some_int = __LINE__;
   1151
   1152	dprintf1("disabling write to PKEY[%02d], "
   1153		 "doing futex gunk in buffer\n", pkey);
   1154	*ptr = some_int;
   1155	pkey_write_deny(pkey);
   1156	futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
   1157			&ignored, ignored);
   1158	if (DEBUG_LEVEL > 0)
   1159		perror("futex");
   1160	dprintf1("futex() ret: %d\n", futex_ret);
   1161}
   1162
   1163/* Assumes that all pkeys other than 'pkey' are unallocated */
   1164void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
   1165{
   1166	int err;
   1167	int i;
   1168
   1169	/* Note: 0 is the default pkey, so don't mess with it */
   1170	for (i = 1; i < NR_PKEYS; i++) {
   1171		if (pkey == i)
   1172			continue;
   1173
   1174		dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
   1175		err = sys_pkey_free(i);
   1176		pkey_assert(err);
   1177
   1178		err = sys_pkey_free(i);
   1179		pkey_assert(err);
   1180
   1181		err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
   1182		pkey_assert(err);
   1183	}
   1184}
   1185
   1186/* Assumes that all pkeys other than 'pkey' are unallocated */
   1187void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
   1188{
   1189	int err;
   1190	int bad_pkey = NR_PKEYS+99;
   1191
   1192	/* pass a known-invalid pkey in: */
   1193	err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
   1194	pkey_assert(err);
   1195}
   1196
   1197void become_child(void)
   1198{
   1199	pid_t forkret;
   1200
   1201	forkret = fork();
   1202	pkey_assert(forkret >= 0);
   1203	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
   1204
   1205	if (!forkret) {
   1206		/* in the child */
   1207		return;
   1208	}
   1209	exit(0);
   1210}
   1211
   1212/* Assumes that all pkeys other than 'pkey' are unallocated */
   1213void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
   1214{
   1215	int err;
   1216	int allocated_pkeys[NR_PKEYS] = {0};
   1217	int nr_allocated_pkeys = 0;
   1218	int i;
   1219
   1220	for (i = 0; i < NR_PKEYS*3; i++) {
   1221		int new_pkey;
   1222		dprintf1("%s() alloc loop: %d\n", __func__, i);
   1223		new_pkey = alloc_pkey();
   1224		dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
   1225				" shadow: 0x%016llx\n",
   1226				__func__, __LINE__, err, __read_pkey_reg(),
   1227				shadow_pkey_reg);
   1228		read_pkey_reg(); /* for shadow checking */
   1229		dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
   1230		if ((new_pkey == -1) && (errno == ENOSPC)) {
   1231			dprintf2("%s() failed to allocate pkey after %d tries\n",
   1232				__func__, nr_allocated_pkeys);
   1233		} else {
   1234			/*
   1235			 * Ensure the number of successes never
   1236			 * exceeds the number of keys supported
   1237			 * in the hardware.
   1238			 */
   1239			pkey_assert(nr_allocated_pkeys < NR_PKEYS);
   1240			allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
   1241		}
   1242
   1243		/*
   1244		 * Make sure that allocation state is properly
   1245		 * preserved across fork().
   1246		 */
   1247		if (i == NR_PKEYS*2)
   1248			become_child();
   1249	}
   1250
   1251	dprintf3("%s()::%d\n", __func__, __LINE__);
   1252
   1253	/*
   1254	 * On x86:
   1255	 * There are 16 pkeys supported in hardware.  Three are
   1256	 * allocated by the time we get here:
   1257	 *   1. The default key (0)
   1258	 *   2. One possibly consumed by an execute-only mapping.
   1259	 *   3. One allocated by the test code and passed in via
   1260	 *      'pkey' to this function.
   1261	 * Ensure that we can allocate at least another 13 (16-3).
   1262	 *
   1263	 * On powerpc:
   1264	 * There are either 5, 28, 29 or 32 pkeys supported in
   1265	 * hardware depending on the page size (4K or 64K) and
   1266	 * platform (powernv or powervm). Four are allocated by
   1267	 * the time we get here. These include pkey-0, pkey-1,
   1268	 * exec-only pkey and the one allocated by the test code.
   1269	 * Ensure that we can allocate the remaining.
   1270	 */
   1271	pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
   1272
   1273	for (i = 0; i < nr_allocated_pkeys; i++) {
   1274		err = sys_pkey_free(allocated_pkeys[i]);
   1275		pkey_assert(!err);
   1276		read_pkey_reg(); /* for shadow checking */
   1277	}
   1278}
   1279
   1280void arch_force_pkey_reg_init(void)
   1281{
   1282#if defined(__i386__) || defined(__x86_64__) /* arch */
   1283	u64 *buf;
   1284
   1285	/*
   1286	 * All keys should be allocated and set to allow reads and
   1287	 * writes, so the register should be all 0.  If not, just
   1288	 * skip the test.
   1289	 */
   1290	if (read_pkey_reg())
   1291		return;
   1292
   1293	/*
   1294	 * Just allocate an absurd about of memory rather than
   1295	 * doing the XSAVE size enumeration dance.
   1296	 */
   1297	buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
   1298
   1299	/* These __builtins require compiling with -mxsave */
   1300
   1301	/* XSAVE to build a valid buffer: */
   1302	__builtin_ia32_xsave(buf, XSTATE_PKEY);
   1303	/* Clear XSTATE_BV[PKRU]: */
   1304	buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
   1305	/* XRSTOR will likely get PKRU back to the init state: */
   1306	__builtin_ia32_xrstor(buf, XSTATE_PKEY);
   1307
   1308	munmap(buf, 1*MB);
   1309#endif
   1310}
   1311
   1312
   1313/*
   1314 * This is mostly useless on ppc for now.  But it will not
   1315 * hurt anything and should give some better coverage as
   1316 * a long-running test that continually checks the pkey
   1317 * register.
   1318 */
   1319void test_pkey_init_state(int *ptr, u16 pkey)
   1320{
   1321	int err;
   1322	int allocated_pkeys[NR_PKEYS] = {0};
   1323	int nr_allocated_pkeys = 0;
   1324	int i;
   1325
   1326	for (i = 0; i < NR_PKEYS; i++) {
   1327		int new_pkey = alloc_pkey();
   1328
   1329		if (new_pkey < 0)
   1330			continue;
   1331		allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
   1332	}
   1333
   1334	dprintf3("%s()::%d\n", __func__, __LINE__);
   1335
   1336	arch_force_pkey_reg_init();
   1337
   1338	/*
   1339	 * Loop for a bit, hoping to get exercise the kernel
   1340	 * context switch code.
   1341	 */
   1342	for (i = 0; i < 1000000; i++)
   1343		read_pkey_reg();
   1344
   1345	for (i = 0; i < nr_allocated_pkeys; i++) {
   1346		err = sys_pkey_free(allocated_pkeys[i]);
   1347		pkey_assert(!err);
   1348		read_pkey_reg(); /* for shadow checking */
   1349	}
   1350}
   1351
   1352/*
   1353 * pkey 0 is special.  It is allocated by default, so you do not
   1354 * have to call pkey_alloc() to use it first.  Make sure that it
   1355 * is usable.
   1356 */
   1357void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
   1358{
   1359	long size;
   1360	int prot;
   1361
   1362	assert(pkey_last_malloc_record);
   1363	size = pkey_last_malloc_record->size;
   1364	/*
   1365	 * This is a bit of a hack.  But mprotect() requires
   1366	 * huge-page-aligned sizes when operating on hugetlbfs.
   1367	 * So, make sure that we use something that's a multiple
   1368	 * of a huge page when we can.
   1369	 */
   1370	if (size >= HPAGE_SIZE)
   1371		size = HPAGE_SIZE;
   1372	prot = pkey_last_malloc_record->prot;
   1373
   1374	/* Use pkey 0 */
   1375	mprotect_pkey(ptr, size, prot, 0);
   1376
   1377	/* Make sure that we can set it back to the original pkey. */
   1378	mprotect_pkey(ptr, size, prot, pkey);
   1379}
   1380
   1381void test_ptrace_of_child(int *ptr, u16 pkey)
   1382{
   1383	__attribute__((__unused__)) int peek_result;
   1384	pid_t child_pid;
   1385	void *ignored = 0;
   1386	long ret;
   1387	int status;
   1388	/*
   1389	 * This is the "control" for our little expermient.  Make sure
   1390	 * we can always access it when ptracing.
   1391	 */
   1392	int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
   1393	int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
   1394
   1395	/*
   1396	 * Fork a child which is an exact copy of this process, of course.
   1397	 * That means we can do all of our tests via ptrace() and then plain
   1398	 * memory access and ensure they work differently.
   1399	 */
   1400	child_pid = fork_lazy_child();
   1401	dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
   1402
   1403	ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
   1404	if (ret)
   1405		perror("attach");
   1406	dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
   1407	pkey_assert(ret != -1);
   1408	ret = waitpid(child_pid, &status, WUNTRACED);
   1409	if ((ret != child_pid) || !(WIFSTOPPED(status))) {
   1410		fprintf(stderr, "weird waitpid result %ld stat %x\n",
   1411				ret, status);
   1412		pkey_assert(0);
   1413	}
   1414	dprintf2("waitpid ret: %ld\n", ret);
   1415	dprintf2("waitpid status: %d\n", status);
   1416
   1417	pkey_access_deny(pkey);
   1418	pkey_write_deny(pkey);
   1419
   1420	/* Write access, untested for now:
   1421	ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
   1422	pkey_assert(ret != -1);
   1423	dprintf1("poke at %p: %ld\n", peek_at, ret);
   1424	*/
   1425
   1426	/*
   1427	 * Try to access the pkey-protected "ptr" via ptrace:
   1428	 */
   1429	ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
   1430	/* expect it to work, without an error: */
   1431	pkey_assert(ret != -1);
   1432	/* Now access from the current task, and expect an exception: */
   1433	peek_result = read_ptr(ptr);
   1434	expected_pkey_fault(pkey);
   1435
   1436	/*
   1437	 * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
   1438	 */
   1439	ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
   1440	/* expect it to work, without an error: */
   1441	pkey_assert(ret != -1);
   1442	/* Now access from the current task, and expect NO exception: */
   1443	peek_result = read_ptr(plain_ptr);
   1444	do_not_expect_pkey_fault("read plain pointer after ptrace");
   1445
   1446	ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
   1447	pkey_assert(ret != -1);
   1448
   1449	ret = kill(child_pid, SIGKILL);
   1450	pkey_assert(ret != -1);
   1451
   1452	wait(&status);
   1453
   1454	free(plain_ptr_unaligned);
   1455}
   1456
   1457void *get_pointer_to_instructions(void)
   1458{
   1459	void *p1;
   1460
   1461	p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
   1462	dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
   1463	/* lots_o_noops_around_write should be page-aligned already */
   1464	assert(p1 == &lots_o_noops_around_write);
   1465
   1466	/* Point 'p1' at the *second* page of the function: */
   1467	p1 += PAGE_SIZE;
   1468
   1469	/*
   1470	 * Try to ensure we fault this in on next touch to ensure
   1471	 * we get an instruction fault as opposed to a data one
   1472	 */
   1473	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
   1474
   1475	return p1;
   1476}
   1477
   1478void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
   1479{
   1480	void *p1;
   1481	int scratch;
   1482	int ptr_contents;
   1483	int ret;
   1484
   1485	p1 = get_pointer_to_instructions();
   1486	lots_o_noops_around_write(&scratch);
   1487	ptr_contents = read_ptr(p1);
   1488	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
   1489
   1490	ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
   1491	pkey_assert(!ret);
   1492	pkey_access_deny(pkey);
   1493
   1494	dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
   1495
   1496	/*
   1497	 * Make sure this is an *instruction* fault
   1498	 */
   1499	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
   1500	lots_o_noops_around_write(&scratch);
   1501	do_not_expect_pkey_fault("executing on PROT_EXEC memory");
   1502	expect_fault_on_read_execonly_key(p1, pkey);
   1503}
   1504
   1505void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
   1506{
   1507	void *p1;
   1508	int scratch;
   1509	int ptr_contents;
   1510	int ret;
   1511
   1512	dprintf1("%s() start\n", __func__);
   1513
   1514	p1 = get_pointer_to_instructions();
   1515	lots_o_noops_around_write(&scratch);
   1516	ptr_contents = read_ptr(p1);
   1517	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
   1518
   1519	/* Use a *normal* mprotect(), not mprotect_pkey(): */
   1520	ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
   1521	pkey_assert(!ret);
   1522
   1523	/*
   1524	 * Reset the shadow, assuming that the above mprotect()
   1525	 * correctly changed PKRU, but to an unknown value since
   1526	 * the actual allocated pkey is unknown.
   1527	 */
   1528	shadow_pkey_reg = __read_pkey_reg();
   1529
   1530	dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
   1531
   1532	/* Make sure this is an *instruction* fault */
   1533	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
   1534	lots_o_noops_around_write(&scratch);
   1535	do_not_expect_pkey_fault("executing on PROT_EXEC memory");
   1536	expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
   1537
   1538	/*
   1539	 * Put the memory back to non-PROT_EXEC.  Should clear the
   1540	 * exec-only pkey off the VMA and allow it to be readable
   1541	 * again.  Go to PROT_NONE first to check for a kernel bug
   1542	 * that did not clear the pkey when doing PROT_NONE.
   1543	 */
   1544	ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
   1545	pkey_assert(!ret);
   1546
   1547	ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
   1548	pkey_assert(!ret);
   1549	ptr_contents = read_ptr(p1);
   1550	do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
   1551}
   1552
   1553void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
   1554{
   1555	int size = PAGE_SIZE;
   1556	int sret;
   1557
   1558	if (cpu_has_pkeys()) {
   1559		dprintf1("SKIP: %s: no CPU support\n", __func__);
   1560		return;
   1561	}
   1562
   1563	sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
   1564	pkey_assert(sret < 0);
   1565}
   1566
   1567void (*pkey_tests[])(int *ptr, u16 pkey) = {
   1568	test_read_of_write_disabled_region,
   1569	test_read_of_access_disabled_region,
   1570	test_read_of_access_disabled_region_with_page_already_mapped,
   1571	test_write_of_write_disabled_region,
   1572	test_write_of_write_disabled_region_with_page_already_mapped,
   1573	test_write_of_access_disabled_region,
   1574	test_write_of_access_disabled_region_with_page_already_mapped,
   1575	test_kernel_write_of_access_disabled_region,
   1576	test_kernel_write_of_write_disabled_region,
   1577	test_kernel_gup_of_access_disabled_region,
   1578	test_kernel_gup_write_to_write_disabled_region,
   1579	test_executing_on_unreadable_memory,
   1580	test_implicit_mprotect_exec_only_memory,
   1581	test_mprotect_with_pkey_0,
   1582	test_ptrace_of_child,
   1583	test_pkey_init_state,
   1584	test_pkey_syscalls_on_non_allocated_pkey,
   1585	test_pkey_syscalls_bad_args,
   1586	test_pkey_alloc_exhaust,
   1587	test_pkey_alloc_free_attach_pkey0,
   1588};
   1589
   1590void run_tests_once(void)
   1591{
   1592	int *ptr;
   1593	int prot = PROT_READ|PROT_WRITE;
   1594
   1595	for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
   1596		int pkey;
   1597		int orig_pkey_faults = pkey_faults;
   1598
   1599		dprintf1("======================\n");
   1600		dprintf1("test %d preparing...\n", test_nr);
   1601
   1602		tracing_on();
   1603		pkey = alloc_random_pkey();
   1604		dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
   1605		ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
   1606		dprintf1("test %d starting...\n", test_nr);
   1607		pkey_tests[test_nr](ptr, pkey);
   1608		dprintf1("freeing test memory: %p\n", ptr);
   1609		free_pkey_malloc(ptr);
   1610		sys_pkey_free(pkey);
   1611
   1612		dprintf1("pkey_faults: %d\n", pkey_faults);
   1613		dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
   1614
   1615		tracing_off();
   1616		close_test_fds();
   1617
   1618		printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
   1619		dprintf1("======================\n\n");
   1620	}
   1621	iteration_nr++;
   1622}
   1623
   1624void pkey_setup_shadow(void)
   1625{
   1626	shadow_pkey_reg = __read_pkey_reg();
   1627}
   1628
   1629int main(void)
   1630{
   1631	int nr_iterations = 22;
   1632	int pkeys_supported = is_pkeys_supported();
   1633
   1634	srand((unsigned int)time(NULL));
   1635
   1636	setup_handlers();
   1637
   1638	printf("has pkeys: %d\n", pkeys_supported);
   1639
   1640	if (!pkeys_supported) {
   1641		int size = PAGE_SIZE;
   1642		int *ptr;
   1643
   1644		printf("running PKEY tests for unsupported CPU/OS\n");
   1645
   1646		ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
   1647		assert(ptr != (void *)-1);
   1648		test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
   1649		exit(0);
   1650	}
   1651
   1652	pkey_setup_shadow();
   1653	printf("startup pkey_reg: %016llx\n", read_pkey_reg());
   1654	setup_hugetlbfs();
   1655
   1656	while (nr_iterations-- > 0)
   1657		run_tests_once();
   1658
   1659	printf("done (all tests OK)\n");
   1660	return 0;
   1661}