userfaultfd.c (45662B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Stress userfaultfd syscall. 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 6 * 7 * This test allocates two virtual areas and bounces the physical 8 * memory across the two virtual areas (from area_src to area_dst) 9 * using userfaultfd. 10 * 11 * There are three threads running per CPU: 12 * 13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random 14 * page of the area_dst (while the physical page may still be in 15 * area_src), and increments a per-page counter in the same page, 16 * and checks its value against a verification region. 17 * 18 * 2) another per-CPU thread handles the userfaults generated by 19 * thread 1 above. userfaultfd blocking reads or poll() modes are 20 * exercised interleaved. 21 * 22 * 3) one last per-CPU thread transfers the memory in the background 23 * at maximum bandwidth (if not already transferred by thread 24 * 2). Each cpu thread takes cares of transferring a portion of the 25 * area. 26 * 27 * When all threads of type 3 completed the transfer, one bounce is 28 * complete. area_src and area_dst are then swapped. All threads are 29 * respawned and so the bounce is immediately restarted in the 30 * opposite direction. 31 * 32 * per-CPU threads 1 by triggering userfaults inside 33 * pthread_mutex_lock will also verify the atomicity of the memory 34 * transfer (UFFDIO_COPY). 35 */ 36 37#define _GNU_SOURCE 38#include <stdio.h> 39#include <errno.h> 40#include <unistd.h> 41#include <stdlib.h> 42#include <sys/types.h> 43#include <sys/stat.h> 44#include <fcntl.h> 45#include <time.h> 46#include <signal.h> 47#include <poll.h> 48#include <string.h> 49#include <linux/mman.h> 50#include <sys/mman.h> 51#include <sys/syscall.h> 52#include <sys/ioctl.h> 53#include <sys/wait.h> 54#include <pthread.h> 55#include <linux/userfaultfd.h> 56#include <setjmp.h> 57#include <stdbool.h> 58#include <assert.h> 59#include <inttypes.h> 60#include <stdint.h> 61#include <sys/random.h> 62 63#include "../kselftest.h" 64 65#ifdef __NR_userfaultfd 66 67static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; 68 69#define BOUNCE_RANDOM (1<<0) 70#define BOUNCE_RACINGFAULTS (1<<1) 71#define BOUNCE_VERIFY (1<<2) 72#define BOUNCE_POLL (1<<3) 73static int bounces; 74 75#define TEST_ANON 1 76#define TEST_HUGETLB 2 77#define TEST_SHMEM 3 78static int test_type; 79 80/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ 81#define ALARM_INTERVAL_SECS 10 82static volatile bool test_uffdio_copy_eexist = true; 83static volatile bool test_uffdio_zeropage_eexist = true; 84/* Whether to test uffd write-protection */ 85static bool test_uffdio_wp = true; 86/* Whether to test uffd minor faults */ 87static bool test_uffdio_minor = false; 88 89static bool map_shared; 90static int shm_fd; 91static int huge_fd; 92static unsigned long long *count_verify; 93static int uffd = -1; 94static int uffd_flags, finished, *pipefd; 95static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; 96static char *zeropage; 97pthread_attr_t attr; 98 99/* Userfaultfd test statistics */ 100struct uffd_stats { 101 int cpu; 102 unsigned long missing_faults; 103 unsigned long wp_faults; 104 unsigned long minor_faults; 105}; 106 107/* pthread_mutex_t starts at page offset 0 */ 108#define area_mutex(___area, ___nr) \ 109 ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) 110/* 111 * count is placed in the page after pthread_mutex_t naturally aligned 112 * to avoid non alignment faults on non-x86 archs. 113 */ 114#define area_count(___area, ___nr) \ 115 ((volatile unsigned long long *) ((unsigned long) \ 116 ((___area) + (___nr)*page_size + \ 117 sizeof(pthread_mutex_t) + \ 118 sizeof(unsigned long long) - 1) & \ 119 ~(unsigned long)(sizeof(unsigned long long) \ 120 - 1))) 121 122#define swap(a, b) \ 123 do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) 124 125const char *examples = 126 "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" 127 "./userfaultfd anon 100 99999\n\n" 128 "# Run share memory test on 1GiB region with 99 bounces:\n" 129 "./userfaultfd shmem 1000 99\n\n" 130 "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" 131 "./userfaultfd hugetlb 256 50\n\n" 132 "# Run the same hugetlb test but using shared file:\n" 133 "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" 134 "# 10MiB-~6GiB 999 bounces anonymous test, " 135 "continue forever unless an error triggers\n" 136 "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; 137 138static void usage(void) 139{ 140 fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " 141 "[hugetlbfs_file]\n\n"); 142 fprintf(stderr, "Supported <test type>: anon, hugetlb, " 143 "hugetlb_shared, shmem\n\n"); 144 fprintf(stderr, "Examples:\n\n"); 145 fprintf(stderr, "%s", examples); 146 exit(1); 147} 148 149#define _err(fmt, ...) \ 150 do { \ 151 int ret = errno; \ 152 fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ 153 fprintf(stderr, " (errno=%d, line=%d)\n", \ 154 ret, __LINE__); \ 155 } while (0) 156 157#define err(fmt, ...) \ 158 do { \ 159 _err(fmt, ##__VA_ARGS__); \ 160 exit(1); \ 161 } while (0) 162 163static void uffd_stats_reset(struct uffd_stats *uffd_stats, 164 unsigned long n_cpus) 165{ 166 int i; 167 168 for (i = 0; i < n_cpus; i++) { 169 uffd_stats[i].cpu = i; 170 uffd_stats[i].missing_faults = 0; 171 uffd_stats[i].wp_faults = 0; 172 uffd_stats[i].minor_faults = 0; 173 } 174} 175 176static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) 177{ 178 int i; 179 unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; 180 181 for (i = 0; i < n_cpus; i++) { 182 miss_total += stats[i].missing_faults; 183 wp_total += stats[i].wp_faults; 184 minor_total += stats[i].minor_faults; 185 } 186 187 printf("userfaults: "); 188 if (miss_total) { 189 printf("%llu missing (", miss_total); 190 for (i = 0; i < n_cpus; i++) 191 printf("%lu+", stats[i].missing_faults); 192 printf("\b) "); 193 } 194 if (wp_total) { 195 printf("%llu wp (", wp_total); 196 for (i = 0; i < n_cpus; i++) 197 printf("%lu+", stats[i].wp_faults); 198 printf("\b) "); 199 } 200 if (minor_total) { 201 printf("%llu minor (", minor_total); 202 for (i = 0; i < n_cpus; i++) 203 printf("%lu+", stats[i].minor_faults); 204 printf("\b)"); 205 } 206 printf("\n"); 207} 208 209static void anon_release_pages(char *rel_area) 210{ 211 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 212 err("madvise(MADV_DONTNEED) failed"); 213} 214 215static void anon_allocate_area(void **alloc_area) 216{ 217 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 218 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 219 if (*alloc_area == MAP_FAILED) 220 err("mmap of anonymous memory failed"); 221} 222 223static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) 224{ 225} 226 227static void hugetlb_release_pages(char *rel_area) 228{ 229 if (!map_shared) { 230 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 231 err("madvise(MADV_DONTNEED) failed"); 232 } else { 233 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 234 err("madvise(MADV_REMOVE) failed"); 235 } 236} 237 238static void hugetlb_allocate_area(void **alloc_area) 239{ 240 void *area_alias = NULL; 241 char **alloc_area_alias; 242 243 if (!map_shared) 244 *alloc_area = mmap(NULL, 245 nr_pages * page_size, 246 PROT_READ | PROT_WRITE, 247 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | 248 (*alloc_area == area_src ? 0 : MAP_NORESERVE), 249 -1, 250 0); 251 else 252 *alloc_area = mmap(NULL, 253 nr_pages * page_size, 254 PROT_READ | PROT_WRITE, 255 MAP_SHARED | 256 (*alloc_area == area_src ? 0 : MAP_NORESERVE), 257 huge_fd, 258 *alloc_area == area_src ? 0 : nr_pages * page_size); 259 if (*alloc_area == MAP_FAILED) 260 err("mmap of hugetlbfs file failed"); 261 262 if (map_shared) { 263 area_alias = mmap(NULL, 264 nr_pages * page_size, 265 PROT_READ | PROT_WRITE, 266 MAP_SHARED, 267 huge_fd, 268 *alloc_area == area_src ? 0 : nr_pages * page_size); 269 if (area_alias == MAP_FAILED) 270 err("mmap of hugetlb file alias failed"); 271 } 272 273 if (*alloc_area == area_src) { 274 alloc_area_alias = &area_src_alias; 275 } else { 276 alloc_area_alias = &area_dst_alias; 277 } 278 if (area_alias) 279 *alloc_area_alias = area_alias; 280} 281 282static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) 283{ 284 if (!map_shared) 285 return; 286 287 *start = (unsigned long) area_dst_alias + offset; 288} 289 290static void shmem_release_pages(char *rel_area) 291{ 292 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 293 err("madvise(MADV_REMOVE) failed"); 294} 295 296static void shmem_allocate_area(void **alloc_area) 297{ 298 void *area_alias = NULL; 299 bool is_src = alloc_area == (void **)&area_src; 300 unsigned long offset = is_src ? 0 : nr_pages * page_size; 301 302 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 303 MAP_SHARED, shm_fd, offset); 304 if (*alloc_area == MAP_FAILED) 305 err("mmap of memfd failed"); 306 307 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 308 MAP_SHARED, shm_fd, offset); 309 if (area_alias == MAP_FAILED) 310 err("mmap of memfd alias failed"); 311 312 if (is_src) 313 area_src_alias = area_alias; 314 else 315 area_dst_alias = area_alias; 316} 317 318static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) 319{ 320 *start = (unsigned long)area_dst_alias + offset; 321} 322 323struct uffd_test_ops { 324 void (*allocate_area)(void **alloc_area); 325 void (*release_pages)(char *rel_area); 326 void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); 327}; 328 329static struct uffd_test_ops anon_uffd_test_ops = { 330 .allocate_area = anon_allocate_area, 331 .release_pages = anon_release_pages, 332 .alias_mapping = noop_alias_mapping, 333}; 334 335static struct uffd_test_ops shmem_uffd_test_ops = { 336 .allocate_area = shmem_allocate_area, 337 .release_pages = shmem_release_pages, 338 .alias_mapping = shmem_alias_mapping, 339}; 340 341static struct uffd_test_ops hugetlb_uffd_test_ops = { 342 .allocate_area = hugetlb_allocate_area, 343 .release_pages = hugetlb_release_pages, 344 .alias_mapping = hugetlb_alias_mapping, 345}; 346 347static struct uffd_test_ops *uffd_test_ops; 348 349static inline uint64_t uffd_minor_feature(void) 350{ 351 if (test_type == TEST_HUGETLB && map_shared) 352 return UFFD_FEATURE_MINOR_HUGETLBFS; 353 else if (test_type == TEST_SHMEM) 354 return UFFD_FEATURE_MINOR_SHMEM; 355 else 356 return 0; 357} 358 359static uint64_t get_expected_ioctls(uint64_t mode) 360{ 361 uint64_t ioctls = UFFD_API_RANGE_IOCTLS; 362 363 if (test_type == TEST_HUGETLB) 364 ioctls &= ~(1 << _UFFDIO_ZEROPAGE); 365 366 if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) 367 ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); 368 369 if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) 370 ioctls &= ~(1 << _UFFDIO_CONTINUE); 371 372 return ioctls; 373} 374 375static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) 376{ 377 uint64_t expected = get_expected_ioctls(mode); 378 uint64_t actual = ioctls & expected; 379 380 if (actual != expected) { 381 err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, 382 expected, actual); 383 } 384} 385 386static void userfaultfd_open(uint64_t *features) 387{ 388 struct uffdio_api uffdio_api; 389 390 uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); 391 if (uffd < 0) 392 err("userfaultfd syscall not available in this kernel"); 393 uffd_flags = fcntl(uffd, F_GETFD, NULL); 394 395 uffdio_api.api = UFFD_API; 396 uffdio_api.features = *features; 397 if (ioctl(uffd, UFFDIO_API, &uffdio_api)) 398 err("UFFDIO_API failed.\nPlease make sure to " 399 "run with either root or ptrace capability."); 400 if (uffdio_api.api != UFFD_API) 401 err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); 402 403 *features = uffdio_api.features; 404} 405 406static inline void munmap_area(void **area) 407{ 408 if (*area) 409 if (munmap(*area, nr_pages * page_size)) 410 err("munmap"); 411 412 *area = NULL; 413} 414 415static void uffd_test_ctx_clear(void) 416{ 417 size_t i; 418 419 if (pipefd) { 420 for (i = 0; i < nr_cpus * 2; ++i) { 421 if (close(pipefd[i])) 422 err("close pipefd"); 423 } 424 free(pipefd); 425 pipefd = NULL; 426 } 427 428 if (count_verify) { 429 free(count_verify); 430 count_verify = NULL; 431 } 432 433 if (uffd != -1) { 434 if (close(uffd)) 435 err("close uffd"); 436 uffd = -1; 437 } 438 439 munmap_area((void **)&area_src); 440 munmap_area((void **)&area_src_alias); 441 munmap_area((void **)&area_dst); 442 munmap_area((void **)&area_dst_alias); 443} 444 445static void uffd_test_ctx_init(uint64_t features) 446{ 447 unsigned long nr, cpu; 448 449 uffd_test_ctx_clear(); 450 451 uffd_test_ops->allocate_area((void **)&area_src); 452 uffd_test_ops->allocate_area((void **)&area_dst); 453 454 userfaultfd_open(&features); 455 456 count_verify = malloc(nr_pages * sizeof(unsigned long long)); 457 if (!count_verify) 458 err("count_verify"); 459 460 for (nr = 0; nr < nr_pages; nr++) { 461 *area_mutex(area_src, nr) = 462 (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; 463 count_verify[nr] = *area_count(area_src, nr) = 1; 464 /* 465 * In the transition between 255 to 256, powerpc will 466 * read out of order in my_bcmp and see both bytes as 467 * zero, so leave a placeholder below always non-zero 468 * after the count, to avoid my_bcmp to trigger false 469 * positives. 470 */ 471 *(area_count(area_src, nr) + 1) = 1; 472 } 473 474 /* 475 * After initialization of area_src, we must explicitly release pages 476 * for area_dst to make sure it's fully empty. Otherwise we could have 477 * some area_dst pages be errornously initialized with zero pages, 478 * hence we could hit memory corruption later in the test. 479 * 480 * One example is when THP is globally enabled, above allocate_area() 481 * calls could have the two areas merged into a single VMA (as they 482 * will have the same VMA flags so they're mergeable). When we 483 * initialize the area_src above, it's possible that some part of 484 * area_dst could have been faulted in via one huge THP that will be 485 * shared between area_src and area_dst. It could cause some of the 486 * area_dst won't be trapped by missing userfaults. 487 * 488 * This release_pages() will guarantee even if that happened, we'll 489 * proactively split the thp and drop any accidentally initialized 490 * pages within area_dst. 491 */ 492 uffd_test_ops->release_pages(area_dst); 493 494 pipefd = malloc(sizeof(int) * nr_cpus * 2); 495 if (!pipefd) 496 err("pipefd"); 497 for (cpu = 0; cpu < nr_cpus; cpu++) 498 if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) 499 err("pipe"); 500} 501 502static int my_bcmp(char *str1, char *str2, size_t n) 503{ 504 unsigned long i; 505 for (i = 0; i < n; i++) 506 if (str1[i] != str2[i]) 507 return 1; 508 return 0; 509} 510 511static void wp_range(int ufd, __u64 start, __u64 len, bool wp) 512{ 513 struct uffdio_writeprotect prms; 514 515 /* Write protection page faults */ 516 prms.range.start = start; 517 prms.range.len = len; 518 /* Undo write-protect, do wakeup after that */ 519 prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; 520 521 if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) 522 err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); 523} 524 525static void continue_range(int ufd, __u64 start, __u64 len) 526{ 527 struct uffdio_continue req; 528 int ret; 529 530 req.range.start = start; 531 req.range.len = len; 532 req.mode = 0; 533 534 if (ioctl(ufd, UFFDIO_CONTINUE, &req)) 535 err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, 536 (uint64_t)start); 537 538 /* 539 * Error handling within the kernel for continue is subtly different 540 * from copy or zeropage, so it may be a source of bugs. Trigger an 541 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. 542 */ 543 req.mapped = 0; 544 ret = ioctl(ufd, UFFDIO_CONTINUE, &req); 545 if (ret >= 0 || req.mapped != -EEXIST) 546 err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, 547 ret, (int64_t) req.mapped); 548} 549 550static void *locking_thread(void *arg) 551{ 552 unsigned long cpu = (unsigned long) arg; 553 unsigned long page_nr; 554 unsigned long long count; 555 556 if (!(bounces & BOUNCE_RANDOM)) { 557 page_nr = -bounces; 558 if (!(bounces & BOUNCE_RACINGFAULTS)) 559 page_nr += cpu * nr_pages_per_cpu; 560 } 561 562 while (!finished) { 563 if (bounces & BOUNCE_RANDOM) { 564 if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) 565 err("getrandom failed"); 566 } else 567 page_nr += 1; 568 page_nr %= nr_pages; 569 pthread_mutex_lock(area_mutex(area_dst, page_nr)); 570 count = *area_count(area_dst, page_nr); 571 if (count != count_verify[page_nr]) 572 err("page_nr %lu memory corruption %llu %llu", 573 page_nr, count, count_verify[page_nr]); 574 count++; 575 *area_count(area_dst, page_nr) = count_verify[page_nr] = count; 576 pthread_mutex_unlock(area_mutex(area_dst, page_nr)); 577 } 578 579 return NULL; 580} 581 582static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, 583 unsigned long offset) 584{ 585 uffd_test_ops->alias_mapping(&uffdio_copy->dst, 586 uffdio_copy->len, 587 offset); 588 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { 589 /* real retval in ufdio_copy.copy */ 590 if (uffdio_copy->copy != -EEXIST) 591 err("UFFDIO_COPY retry error: %"PRId64, 592 (int64_t)uffdio_copy->copy); 593 } else { 594 err("UFFDIO_COPY retry unexpected: %"PRId64, 595 (int64_t)uffdio_copy->copy); 596 } 597} 598 599static void wake_range(int ufd, unsigned long addr, unsigned long len) 600{ 601 struct uffdio_range uffdio_wake; 602 603 uffdio_wake.start = addr; 604 uffdio_wake.len = len; 605 606 if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) 607 fprintf(stderr, "error waking %lu\n", 608 addr), exit(1); 609} 610 611static int __copy_page(int ufd, unsigned long offset, bool retry) 612{ 613 struct uffdio_copy uffdio_copy; 614 615 if (offset >= nr_pages * page_size) 616 err("unexpected offset %lu\n", offset); 617 uffdio_copy.dst = (unsigned long) area_dst + offset; 618 uffdio_copy.src = (unsigned long) area_src + offset; 619 uffdio_copy.len = page_size; 620 if (test_uffdio_wp) 621 uffdio_copy.mode = UFFDIO_COPY_MODE_WP; 622 else 623 uffdio_copy.mode = 0; 624 uffdio_copy.copy = 0; 625 if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { 626 /* real retval in ufdio_copy.copy */ 627 if (uffdio_copy.copy != -EEXIST) 628 err("UFFDIO_COPY error: %"PRId64, 629 (int64_t)uffdio_copy.copy); 630 wake_range(ufd, uffdio_copy.dst, page_size); 631 } else if (uffdio_copy.copy != page_size) { 632 err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); 633 } else { 634 if (test_uffdio_copy_eexist && retry) { 635 test_uffdio_copy_eexist = false; 636 retry_copy_page(ufd, &uffdio_copy, offset); 637 } 638 return 1; 639 } 640 return 0; 641} 642 643static int copy_page_retry(int ufd, unsigned long offset) 644{ 645 return __copy_page(ufd, offset, true); 646} 647 648static int copy_page(int ufd, unsigned long offset) 649{ 650 return __copy_page(ufd, offset, false); 651} 652 653static int uffd_read_msg(int ufd, struct uffd_msg *msg) 654{ 655 int ret = read(uffd, msg, sizeof(*msg)); 656 657 if (ret != sizeof(*msg)) { 658 if (ret < 0) { 659 if (errno == EAGAIN || errno == EINTR) 660 return 1; 661 err("blocking read error"); 662 } else { 663 err("short read"); 664 } 665 } 666 667 return 0; 668} 669 670static void uffd_handle_page_fault(struct uffd_msg *msg, 671 struct uffd_stats *stats) 672{ 673 unsigned long offset; 674 675 if (msg->event != UFFD_EVENT_PAGEFAULT) 676 err("unexpected msg event %u", msg->event); 677 678 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { 679 /* Write protect page faults */ 680 wp_range(uffd, msg->arg.pagefault.address, page_size, false); 681 stats->wp_faults++; 682 } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { 683 uint8_t *area; 684 int b; 685 686 /* 687 * Minor page faults 688 * 689 * To prove we can modify the original range for testing 690 * purposes, we're going to bit flip this range before 691 * continuing. 692 * 693 * Note that this requires all minor page fault tests operate on 694 * area_dst (non-UFFD-registered) and area_dst_alias 695 * (UFFD-registered). 696 */ 697 698 area = (uint8_t *)(area_dst + 699 ((char *)msg->arg.pagefault.address - 700 area_dst_alias)); 701 for (b = 0; b < page_size; ++b) 702 area[b] = ~area[b]; 703 continue_range(uffd, msg->arg.pagefault.address, page_size); 704 stats->minor_faults++; 705 } else { 706 /* Missing page faults */ 707 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) 708 err("unexpected write fault"); 709 710 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; 711 offset &= ~(page_size-1); 712 713 if (copy_page(uffd, offset)) 714 stats->missing_faults++; 715 } 716} 717 718static void *uffd_poll_thread(void *arg) 719{ 720 struct uffd_stats *stats = (struct uffd_stats *)arg; 721 unsigned long cpu = stats->cpu; 722 struct pollfd pollfd[2]; 723 struct uffd_msg msg; 724 struct uffdio_register uffd_reg; 725 int ret; 726 char tmp_chr; 727 728 pollfd[0].fd = uffd; 729 pollfd[0].events = POLLIN; 730 pollfd[1].fd = pipefd[cpu*2]; 731 pollfd[1].events = POLLIN; 732 733 for (;;) { 734 ret = poll(pollfd, 2, -1); 735 if (ret <= 0) { 736 if (errno == EINTR || errno == EAGAIN) 737 continue; 738 err("poll error: %d", ret); 739 } 740 if (pollfd[1].revents & POLLIN) { 741 if (read(pollfd[1].fd, &tmp_chr, 1) != 1) 742 err("read pipefd error"); 743 break; 744 } 745 if (!(pollfd[0].revents & POLLIN)) 746 err("pollfd[0].revents %d", pollfd[0].revents); 747 if (uffd_read_msg(uffd, &msg)) 748 continue; 749 switch (msg.event) { 750 default: 751 err("unexpected msg event %u\n", msg.event); 752 break; 753 case UFFD_EVENT_PAGEFAULT: 754 uffd_handle_page_fault(&msg, stats); 755 break; 756 case UFFD_EVENT_FORK: 757 close(uffd); 758 uffd = msg.arg.fork.ufd; 759 pollfd[0].fd = uffd; 760 break; 761 case UFFD_EVENT_REMOVE: 762 uffd_reg.range.start = msg.arg.remove.start; 763 uffd_reg.range.len = msg.arg.remove.end - 764 msg.arg.remove.start; 765 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) 766 err("remove failure"); 767 break; 768 case UFFD_EVENT_REMAP: 769 area_dst = (char *)(unsigned long)msg.arg.remap.to; 770 break; 771 } 772 } 773 774 return NULL; 775} 776 777pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; 778 779static void *uffd_read_thread(void *arg) 780{ 781 struct uffd_stats *stats = (struct uffd_stats *)arg; 782 struct uffd_msg msg; 783 784 pthread_mutex_unlock(&uffd_read_mutex); 785 /* from here cancellation is ok */ 786 787 for (;;) { 788 if (uffd_read_msg(uffd, &msg)) 789 continue; 790 uffd_handle_page_fault(&msg, stats); 791 } 792 793 return NULL; 794} 795 796static void *background_thread(void *arg) 797{ 798 unsigned long cpu = (unsigned long) arg; 799 unsigned long page_nr, start_nr, mid_nr, end_nr; 800 801 start_nr = cpu * nr_pages_per_cpu; 802 end_nr = (cpu+1) * nr_pages_per_cpu; 803 mid_nr = (start_nr + end_nr) / 2; 804 805 /* Copy the first half of the pages */ 806 for (page_nr = start_nr; page_nr < mid_nr; page_nr++) 807 copy_page_retry(uffd, page_nr * page_size); 808 809 /* 810 * If we need to test uffd-wp, set it up now. Then we'll have 811 * at least the first half of the pages mapped already which 812 * can be write-protected for testing 813 */ 814 if (test_uffdio_wp) 815 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, 816 nr_pages_per_cpu * page_size, true); 817 818 /* 819 * Continue the 2nd half of the page copying, handling write 820 * protection faults if any 821 */ 822 for (page_nr = mid_nr; page_nr < end_nr; page_nr++) 823 copy_page_retry(uffd, page_nr * page_size); 824 825 return NULL; 826} 827 828static int stress(struct uffd_stats *uffd_stats) 829{ 830 unsigned long cpu; 831 pthread_t locking_threads[nr_cpus]; 832 pthread_t uffd_threads[nr_cpus]; 833 pthread_t background_threads[nr_cpus]; 834 835 finished = 0; 836 for (cpu = 0; cpu < nr_cpus; cpu++) { 837 if (pthread_create(&locking_threads[cpu], &attr, 838 locking_thread, (void *)cpu)) 839 return 1; 840 if (bounces & BOUNCE_POLL) { 841 if (pthread_create(&uffd_threads[cpu], &attr, 842 uffd_poll_thread, 843 (void *)&uffd_stats[cpu])) 844 return 1; 845 } else { 846 if (pthread_create(&uffd_threads[cpu], &attr, 847 uffd_read_thread, 848 (void *)&uffd_stats[cpu])) 849 return 1; 850 pthread_mutex_lock(&uffd_read_mutex); 851 } 852 if (pthread_create(&background_threads[cpu], &attr, 853 background_thread, (void *)cpu)) 854 return 1; 855 } 856 for (cpu = 0; cpu < nr_cpus; cpu++) 857 if (pthread_join(background_threads[cpu], NULL)) 858 return 1; 859 860 /* 861 * Be strict and immediately zap area_src, the whole area has 862 * been transferred already by the background treads. The 863 * area_src could then be faulted in in a racy way by still 864 * running uffdio_threads reading zeropages after we zapped 865 * area_src (but they're guaranteed to get -EEXIST from 866 * UFFDIO_COPY without writing zero pages into area_dst 867 * because the background threads already completed). 868 */ 869 uffd_test_ops->release_pages(area_src); 870 871 finished = 1; 872 for (cpu = 0; cpu < nr_cpus; cpu++) 873 if (pthread_join(locking_threads[cpu], NULL)) 874 return 1; 875 876 for (cpu = 0; cpu < nr_cpus; cpu++) { 877 char c; 878 if (bounces & BOUNCE_POLL) { 879 if (write(pipefd[cpu*2+1], &c, 1) != 1) 880 err("pipefd write error"); 881 if (pthread_join(uffd_threads[cpu], 882 (void *)&uffd_stats[cpu])) 883 return 1; 884 } else { 885 if (pthread_cancel(uffd_threads[cpu])) 886 return 1; 887 if (pthread_join(uffd_threads[cpu], NULL)) 888 return 1; 889 } 890 } 891 892 return 0; 893} 894 895sigjmp_buf jbuf, *sigbuf; 896 897static void sighndl(int sig, siginfo_t *siginfo, void *ptr) 898{ 899 if (sig == SIGBUS) { 900 if (sigbuf) 901 siglongjmp(*sigbuf, 1); 902 abort(); 903 } 904} 905 906/* 907 * For non-cooperative userfaultfd test we fork() a process that will 908 * generate pagefaults, will mremap the area monitored by the 909 * userfaultfd and at last this process will release the monitored 910 * area. 911 * For the anonymous and shared memory the area is divided into two 912 * parts, the first part is accessed before mremap, and the second 913 * part is accessed after mremap. Since hugetlbfs does not support 914 * mremap, the entire monitored area is accessed in a single pass for 915 * HUGETLB_TEST. 916 * The release of the pages currently generates event for shmem and 917 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked 918 * for hugetlb. 919 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register 920 * monitored area, generate pagefaults and test that signal is delivered. 921 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 922 * test robustness use case - we release monitored area, fork a process 923 * that will generate pagefaults and verify signal is generated. 924 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal 925 * feature. Using monitor thread, verify no userfault events are generated. 926 */ 927static int faulting_process(int signal_test) 928{ 929 unsigned long nr; 930 unsigned long long count; 931 unsigned long split_nr_pages; 932 unsigned long lastnr; 933 struct sigaction act; 934 unsigned long signalled = 0; 935 936 split_nr_pages = (nr_pages + 1) / 2; 937 938 if (signal_test) { 939 sigbuf = &jbuf; 940 memset(&act, 0, sizeof(act)); 941 act.sa_sigaction = sighndl; 942 act.sa_flags = SA_SIGINFO; 943 if (sigaction(SIGBUS, &act, 0)) 944 err("sigaction"); 945 lastnr = (unsigned long)-1; 946 } 947 948 for (nr = 0; nr < split_nr_pages; nr++) { 949 int steps = 1; 950 unsigned long offset = nr * page_size; 951 952 if (signal_test) { 953 if (sigsetjmp(*sigbuf, 1) != 0) { 954 if (steps == 1 && nr == lastnr) 955 err("Signal repeated"); 956 957 lastnr = nr; 958 if (signal_test == 1) { 959 if (steps == 1) { 960 /* This is a MISSING request */ 961 steps++; 962 if (copy_page(uffd, offset)) 963 signalled++; 964 } else { 965 /* This is a WP request */ 966 assert(steps == 2); 967 wp_range(uffd, 968 (__u64)area_dst + 969 offset, 970 page_size, false); 971 } 972 } else { 973 signalled++; 974 continue; 975 } 976 } 977 } 978 979 count = *area_count(area_dst, nr); 980 if (count != count_verify[nr]) 981 err("nr %lu memory corruption %llu %llu\n", 982 nr, count, count_verify[nr]); 983 /* 984 * Trigger write protection if there is by writing 985 * the same value back. 986 */ 987 *area_count(area_dst, nr) = count; 988 } 989 990 if (signal_test) 991 return signalled != split_nr_pages; 992 993 area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, 994 MREMAP_MAYMOVE | MREMAP_FIXED, area_src); 995 if (area_dst == MAP_FAILED) 996 err("mremap"); 997 /* Reset area_src since we just clobbered it */ 998 area_src = NULL; 999 1000 for (; nr < nr_pages; nr++) { 1001 count = *area_count(area_dst, nr); 1002 if (count != count_verify[nr]) { 1003 err("nr %lu memory corruption %llu %llu\n", 1004 nr, count, count_verify[nr]); 1005 } 1006 /* 1007 * Trigger write protection if there is by writing 1008 * the same value back. 1009 */ 1010 *area_count(area_dst, nr) = count; 1011 } 1012 1013 uffd_test_ops->release_pages(area_dst); 1014 1015 for (nr = 0; nr < nr_pages; nr++) 1016 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) 1017 err("nr %lu is not zero", nr); 1018 1019 return 0; 1020} 1021 1022static void retry_uffdio_zeropage(int ufd, 1023 struct uffdio_zeropage *uffdio_zeropage, 1024 unsigned long offset) 1025{ 1026 uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, 1027 uffdio_zeropage->range.len, 1028 offset); 1029 if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { 1030 if (uffdio_zeropage->zeropage != -EEXIST) 1031 err("UFFDIO_ZEROPAGE error: %"PRId64, 1032 (int64_t)uffdio_zeropage->zeropage); 1033 } else { 1034 err("UFFDIO_ZEROPAGE error: %"PRId64, 1035 (int64_t)uffdio_zeropage->zeropage); 1036 } 1037} 1038 1039static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) 1040{ 1041 struct uffdio_zeropage uffdio_zeropage; 1042 int ret; 1043 bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); 1044 __s64 res; 1045 1046 if (offset >= nr_pages * page_size) 1047 err("unexpected offset %lu", offset); 1048 uffdio_zeropage.range.start = (unsigned long) area_dst + offset; 1049 uffdio_zeropage.range.len = page_size; 1050 uffdio_zeropage.mode = 0; 1051 ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); 1052 res = uffdio_zeropage.zeropage; 1053 if (ret) { 1054 /* real retval in ufdio_zeropage.zeropage */ 1055 if (has_zeropage) 1056 err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); 1057 else if (res != -EINVAL) 1058 err("UFFDIO_ZEROPAGE not -EINVAL"); 1059 } else if (has_zeropage) { 1060 if (res != page_size) { 1061 err("UFFDIO_ZEROPAGE unexpected size"); 1062 } else { 1063 if (test_uffdio_zeropage_eexist && retry) { 1064 test_uffdio_zeropage_eexist = false; 1065 retry_uffdio_zeropage(ufd, &uffdio_zeropage, 1066 offset); 1067 } 1068 return 1; 1069 } 1070 } else 1071 err("UFFDIO_ZEROPAGE succeeded"); 1072 1073 return 0; 1074} 1075 1076static int uffdio_zeropage(int ufd, unsigned long offset) 1077{ 1078 return __uffdio_zeropage(ufd, offset, false); 1079} 1080 1081/* exercise UFFDIO_ZEROPAGE */ 1082static int userfaultfd_zeropage_test(void) 1083{ 1084 struct uffdio_register uffdio_register; 1085 1086 printf("testing UFFDIO_ZEROPAGE: "); 1087 fflush(stdout); 1088 1089 uffd_test_ctx_init(0); 1090 1091 uffdio_register.range.start = (unsigned long) area_dst; 1092 uffdio_register.range.len = nr_pages * page_size; 1093 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1094 if (test_uffdio_wp) 1095 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1096 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1097 err("register failure"); 1098 1099 assert_expected_ioctls_present( 1100 uffdio_register.mode, uffdio_register.ioctls); 1101 1102 if (uffdio_zeropage(uffd, 0)) 1103 if (my_bcmp(area_dst, zeropage, page_size)) 1104 err("zeropage is not zero"); 1105 1106 printf("done.\n"); 1107 return 0; 1108} 1109 1110static int userfaultfd_events_test(void) 1111{ 1112 struct uffdio_register uffdio_register; 1113 pthread_t uffd_mon; 1114 int err, features; 1115 pid_t pid; 1116 char c; 1117 struct uffd_stats stats = { 0 }; 1118 1119 printf("testing events (fork, remap, remove): "); 1120 fflush(stdout); 1121 1122 features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | 1123 UFFD_FEATURE_EVENT_REMOVE; 1124 uffd_test_ctx_init(features); 1125 1126 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1127 1128 uffdio_register.range.start = (unsigned long) area_dst; 1129 uffdio_register.range.len = nr_pages * page_size; 1130 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1131 if (test_uffdio_wp) 1132 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1133 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1134 err("register failure"); 1135 1136 assert_expected_ioctls_present( 1137 uffdio_register.mode, uffdio_register.ioctls); 1138 1139 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1140 err("uffd_poll_thread create"); 1141 1142 pid = fork(); 1143 if (pid < 0) 1144 err("fork"); 1145 1146 if (!pid) 1147 exit(faulting_process(0)); 1148 1149 waitpid(pid, &err, 0); 1150 if (err) 1151 err("faulting process failed"); 1152 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1153 err("pipe write"); 1154 if (pthread_join(uffd_mon, NULL)) 1155 return 1; 1156 1157 uffd_stats_report(&stats, 1); 1158 1159 return stats.missing_faults != nr_pages; 1160} 1161 1162static int userfaultfd_sig_test(void) 1163{ 1164 struct uffdio_register uffdio_register; 1165 unsigned long userfaults; 1166 pthread_t uffd_mon; 1167 int err, features; 1168 pid_t pid; 1169 char c; 1170 struct uffd_stats stats = { 0 }; 1171 1172 printf("testing signal delivery: "); 1173 fflush(stdout); 1174 1175 features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; 1176 uffd_test_ctx_init(features); 1177 1178 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1179 1180 uffdio_register.range.start = (unsigned long) area_dst; 1181 uffdio_register.range.len = nr_pages * page_size; 1182 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1183 if (test_uffdio_wp) 1184 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1185 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1186 err("register failure"); 1187 1188 assert_expected_ioctls_present( 1189 uffdio_register.mode, uffdio_register.ioctls); 1190 1191 if (faulting_process(1)) 1192 err("faulting process failed"); 1193 1194 uffd_test_ops->release_pages(area_dst); 1195 1196 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1197 err("uffd_poll_thread create"); 1198 1199 pid = fork(); 1200 if (pid < 0) 1201 err("fork"); 1202 1203 if (!pid) 1204 exit(faulting_process(2)); 1205 1206 waitpid(pid, &err, 0); 1207 if (err) 1208 err("faulting process failed"); 1209 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1210 err("pipe write"); 1211 if (pthread_join(uffd_mon, (void **)&userfaults)) 1212 return 1; 1213 1214 printf("done.\n"); 1215 if (userfaults) 1216 err("Signal test failed, userfaults: %ld", userfaults); 1217 1218 return userfaults != 0; 1219} 1220 1221static int userfaultfd_minor_test(void) 1222{ 1223 struct uffdio_register uffdio_register; 1224 unsigned long p; 1225 pthread_t uffd_mon; 1226 uint8_t expected_byte; 1227 void *expected_page; 1228 char c; 1229 struct uffd_stats stats = { 0 }; 1230 1231 if (!test_uffdio_minor) 1232 return 0; 1233 1234 printf("testing minor faults: "); 1235 fflush(stdout); 1236 1237 uffd_test_ctx_init(uffd_minor_feature()); 1238 1239 uffdio_register.range.start = (unsigned long)area_dst_alias; 1240 uffdio_register.range.len = nr_pages * page_size; 1241 uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; 1242 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1243 err("register failure"); 1244 1245 assert_expected_ioctls_present( 1246 uffdio_register.mode, uffdio_register.ioctls); 1247 1248 /* 1249 * After registering with UFFD, populate the non-UFFD-registered side of 1250 * the shared mapping. This should *not* trigger any UFFD minor faults. 1251 */ 1252 for (p = 0; p < nr_pages; ++p) { 1253 memset(area_dst + (p * page_size), p % ((uint8_t)-1), 1254 page_size); 1255 } 1256 1257 if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) 1258 err("uffd_poll_thread create"); 1259 1260 /* 1261 * Read each of the pages back using the UFFD-registered mapping. We 1262 * expect that the first time we touch a page, it will result in a minor 1263 * fault. uffd_poll_thread will resolve the fault by bit-flipping the 1264 * page's contents, and then issuing a CONTINUE ioctl. 1265 */ 1266 1267 if (posix_memalign(&expected_page, page_size, page_size)) 1268 err("out of memory"); 1269 1270 for (p = 0; p < nr_pages; ++p) { 1271 expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); 1272 memset(expected_page, expected_byte, page_size); 1273 if (my_bcmp(expected_page, area_dst_alias + (p * page_size), 1274 page_size)) 1275 err("unexpected page contents after minor fault"); 1276 } 1277 1278 if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) 1279 err("pipe write"); 1280 if (pthread_join(uffd_mon, NULL)) 1281 return 1; 1282 1283 uffd_stats_report(&stats, 1); 1284 1285 return stats.missing_faults != 0 || stats.minor_faults != nr_pages; 1286} 1287 1288#define BIT_ULL(nr) (1ULL << (nr)) 1289#define PM_SOFT_DIRTY BIT_ULL(55) 1290#define PM_MMAP_EXCLUSIVE BIT_ULL(56) 1291#define PM_UFFD_WP BIT_ULL(57) 1292#define PM_FILE BIT_ULL(61) 1293#define PM_SWAP BIT_ULL(62) 1294#define PM_PRESENT BIT_ULL(63) 1295 1296static int pagemap_open(void) 1297{ 1298 int fd = open("/proc/self/pagemap", O_RDONLY); 1299 1300 if (fd < 0) 1301 err("open pagemap"); 1302 1303 return fd; 1304} 1305 1306static uint64_t pagemap_read_vaddr(int fd, void *vaddr) 1307{ 1308 uint64_t value; 1309 int ret; 1310 1311 ret = pread(fd, &value, sizeof(uint64_t), 1312 ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); 1313 if (ret != sizeof(uint64_t)) 1314 err("pread() on pagemap failed"); 1315 1316 return value; 1317} 1318 1319/* This macro let __LINE__ works in err() */ 1320#define pagemap_check_wp(value, wp) do { \ 1321 if (!!(value & PM_UFFD_WP) != wp) \ 1322 err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ 1323 } while (0) 1324 1325static int pagemap_test_fork(bool present) 1326{ 1327 pid_t child = fork(); 1328 uint64_t value; 1329 int fd, result; 1330 1331 if (!child) { 1332 /* Open the pagemap fd of the child itself */ 1333 fd = pagemap_open(); 1334 value = pagemap_read_vaddr(fd, area_dst); 1335 /* 1336 * After fork() uffd-wp bit should be gone as long as we're 1337 * without UFFD_FEATURE_EVENT_FORK 1338 */ 1339 pagemap_check_wp(value, false); 1340 /* Succeed */ 1341 exit(0); 1342 } 1343 waitpid(child, &result, 0); 1344 return result; 1345} 1346 1347static void userfaultfd_pagemap_test(unsigned int test_pgsize) 1348{ 1349 struct uffdio_register uffdio_register; 1350 int pagemap_fd; 1351 uint64_t value; 1352 1353 /* Pagemap tests uffd-wp only */ 1354 if (!test_uffdio_wp) 1355 return; 1356 1357 /* Not enough memory to test this page size */ 1358 if (test_pgsize > nr_pages * page_size) 1359 return; 1360 1361 printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); 1362 /* Flush so it doesn't flush twice in parent/child later */ 1363 fflush(stdout); 1364 1365 uffd_test_ctx_init(0); 1366 1367 if (test_pgsize > page_size) { 1368 /* This is a thp test */ 1369 if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) 1370 err("madvise(MADV_HUGEPAGE) failed"); 1371 } else if (test_pgsize == page_size) { 1372 /* This is normal page test; force no thp */ 1373 if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) 1374 err("madvise(MADV_NOHUGEPAGE) failed"); 1375 } 1376 1377 uffdio_register.range.start = (unsigned long) area_dst; 1378 uffdio_register.range.len = nr_pages * page_size; 1379 uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; 1380 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1381 err("register failed"); 1382 1383 pagemap_fd = pagemap_open(); 1384 1385 /* Touch the page */ 1386 *area_dst = 1; 1387 wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); 1388 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1389 pagemap_check_wp(value, true); 1390 /* Make sure uffd-wp bit dropped when fork */ 1391 if (pagemap_test_fork(true)) 1392 err("Detected stall uffd-wp bit in child"); 1393 1394 /* Exclusive required or PAGEOUT won't work */ 1395 if (!(value & PM_MMAP_EXCLUSIVE)) 1396 err("multiple mapping detected: 0x%"PRIx64, value); 1397 1398 if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) 1399 err("madvise(MADV_PAGEOUT) failed"); 1400 1401 /* Uffd-wp should persist even swapped out */ 1402 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1403 pagemap_check_wp(value, true); 1404 /* Make sure uffd-wp bit dropped when fork */ 1405 if (pagemap_test_fork(false)) 1406 err("Detected stall uffd-wp bit in child"); 1407 1408 /* Unprotect; this tests swap pte modifications */ 1409 wp_range(uffd, (uint64_t)area_dst, page_size, false); 1410 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1411 pagemap_check_wp(value, false); 1412 1413 /* Fault in the page from disk */ 1414 *area_dst = 2; 1415 value = pagemap_read_vaddr(pagemap_fd, area_dst); 1416 pagemap_check_wp(value, false); 1417 1418 close(pagemap_fd); 1419 printf("done\n"); 1420} 1421 1422static int userfaultfd_stress(void) 1423{ 1424 void *area; 1425 unsigned long nr; 1426 struct uffdio_register uffdio_register; 1427 struct uffd_stats uffd_stats[nr_cpus]; 1428 1429 uffd_test_ctx_init(0); 1430 1431 if (posix_memalign(&area, page_size, page_size)) 1432 err("out of memory"); 1433 zeropage = area; 1434 bzero(zeropage, page_size); 1435 1436 pthread_mutex_lock(&uffd_read_mutex); 1437 1438 pthread_attr_init(&attr); 1439 pthread_attr_setstacksize(&attr, 16*1024*1024); 1440 1441 while (bounces--) { 1442 printf("bounces: %d, mode:", bounces); 1443 if (bounces & BOUNCE_RANDOM) 1444 printf(" rnd"); 1445 if (bounces & BOUNCE_RACINGFAULTS) 1446 printf(" racing"); 1447 if (bounces & BOUNCE_VERIFY) 1448 printf(" ver"); 1449 if (bounces & BOUNCE_POLL) 1450 printf(" poll"); 1451 else 1452 printf(" read"); 1453 printf(", "); 1454 fflush(stdout); 1455 1456 if (bounces & BOUNCE_POLL) 1457 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); 1458 else 1459 fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); 1460 1461 /* register */ 1462 uffdio_register.range.start = (unsigned long) area_dst; 1463 uffdio_register.range.len = nr_pages * page_size; 1464 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; 1465 if (test_uffdio_wp) 1466 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; 1467 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1468 err("register failure"); 1469 assert_expected_ioctls_present( 1470 uffdio_register.mode, uffdio_register.ioctls); 1471 1472 if (area_dst_alias) { 1473 uffdio_register.range.start = (unsigned long) 1474 area_dst_alias; 1475 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) 1476 err("register failure alias"); 1477 } 1478 1479 /* 1480 * The madvise done previously isn't enough: some 1481 * uffd_thread could have read userfaults (one of 1482 * those already resolved by the background thread) 1483 * and it may be in the process of calling 1484 * UFFDIO_COPY. UFFDIO_COPY will read the zapped 1485 * area_src and it would map a zero page in it (of 1486 * course such a UFFDIO_COPY is perfectly safe as it'd 1487 * return -EEXIST). The problem comes at the next 1488 * bounce though: that racing UFFDIO_COPY would 1489 * generate zeropages in the area_src, so invalidating 1490 * the previous MADV_DONTNEED. Without this additional 1491 * MADV_DONTNEED those zeropages leftovers in the 1492 * area_src would lead to -EEXIST failure during the 1493 * next bounce, effectively leaving a zeropage in the 1494 * area_dst. 1495 * 1496 * Try to comment this out madvise to see the memory 1497 * corruption being caught pretty quick. 1498 * 1499 * khugepaged is also inhibited to collapse THP after 1500 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's 1501 * required to MADV_DONTNEED here. 1502 */ 1503 uffd_test_ops->release_pages(area_dst); 1504 1505 uffd_stats_reset(uffd_stats, nr_cpus); 1506 1507 /* bounce pass */ 1508 if (stress(uffd_stats)) 1509 return 1; 1510 1511 /* Clear all the write protections if there is any */ 1512 if (test_uffdio_wp) 1513 wp_range(uffd, (unsigned long)area_dst, 1514 nr_pages * page_size, false); 1515 1516 /* unregister */ 1517 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) 1518 err("unregister failure"); 1519 if (area_dst_alias) { 1520 uffdio_register.range.start = (unsigned long) area_dst; 1521 if (ioctl(uffd, UFFDIO_UNREGISTER, 1522 &uffdio_register.range)) 1523 err("unregister failure alias"); 1524 } 1525 1526 /* verification */ 1527 if (bounces & BOUNCE_VERIFY) 1528 for (nr = 0; nr < nr_pages; nr++) 1529 if (*area_count(area_dst, nr) != count_verify[nr]) 1530 err("error area_count %llu %llu %lu\n", 1531 *area_count(area_src, nr), 1532 count_verify[nr], nr); 1533 1534 /* prepare next bounce */ 1535 swap(area_src, area_dst); 1536 1537 swap(area_src_alias, area_dst_alias); 1538 1539 uffd_stats_report(uffd_stats, nr_cpus); 1540 } 1541 1542 if (test_type == TEST_ANON) { 1543 /* 1544 * shmem/hugetlb won't be able to run since they have different 1545 * behavior on fork() (file-backed memory normally drops ptes 1546 * directly when fork), meanwhile the pagemap test will verify 1547 * pgtable entry of fork()ed child. 1548 */ 1549 userfaultfd_pagemap_test(page_size); 1550 /* 1551 * Hard-code for x86_64 for now for 2M THP, as x86_64 is 1552 * currently the only one that supports uffd-wp 1553 */ 1554 userfaultfd_pagemap_test(page_size * 512); 1555 } 1556 1557 return userfaultfd_zeropage_test() || userfaultfd_sig_test() 1558 || userfaultfd_events_test() || userfaultfd_minor_test(); 1559} 1560 1561/* 1562 * Copied from mlock2-tests.c 1563 */ 1564unsigned long default_huge_page_size(void) 1565{ 1566 unsigned long hps = 0; 1567 char *line = NULL; 1568 size_t linelen = 0; 1569 FILE *f = fopen("/proc/meminfo", "r"); 1570 1571 if (!f) 1572 return 0; 1573 while (getline(&line, &linelen, f) > 0) { 1574 if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { 1575 hps <<= 10; 1576 break; 1577 } 1578 } 1579 1580 free(line); 1581 fclose(f); 1582 return hps; 1583} 1584 1585static void set_test_type(const char *type) 1586{ 1587 uint64_t features = UFFD_API_FEATURES; 1588 1589 if (!strcmp(type, "anon")) { 1590 test_type = TEST_ANON; 1591 uffd_test_ops = &anon_uffd_test_ops; 1592 } else if (!strcmp(type, "hugetlb")) { 1593 test_type = TEST_HUGETLB; 1594 uffd_test_ops = &hugetlb_uffd_test_ops; 1595 } else if (!strcmp(type, "hugetlb_shared")) { 1596 map_shared = true; 1597 test_type = TEST_HUGETLB; 1598 uffd_test_ops = &hugetlb_uffd_test_ops; 1599 /* Minor faults require shared hugetlb; only enable here. */ 1600 test_uffdio_minor = true; 1601 } else if (!strcmp(type, "shmem")) { 1602 map_shared = true; 1603 test_type = TEST_SHMEM; 1604 uffd_test_ops = &shmem_uffd_test_ops; 1605 test_uffdio_minor = true; 1606 } else { 1607 err("Unknown test type: %s", type); 1608 } 1609 1610 if (test_type == TEST_HUGETLB) 1611 page_size = default_huge_page_size(); 1612 else 1613 page_size = sysconf(_SC_PAGE_SIZE); 1614 1615 if (!page_size) 1616 err("Unable to determine page size"); 1617 if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 1618 > page_size) 1619 err("Impossible to run this test"); 1620 1621 /* 1622 * Whether we can test certain features depends not just on test type, 1623 * but also on whether or not this particular kernel supports the 1624 * feature. 1625 */ 1626 1627 userfaultfd_open(&features); 1628 1629 test_uffdio_wp = test_uffdio_wp && 1630 (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); 1631 test_uffdio_minor = test_uffdio_minor && 1632 (features & uffd_minor_feature()); 1633 1634 close(uffd); 1635 uffd = -1; 1636} 1637 1638static void sigalrm(int sig) 1639{ 1640 if (sig != SIGALRM) 1641 abort(); 1642 test_uffdio_copy_eexist = true; 1643 test_uffdio_zeropage_eexist = true; 1644 alarm(ALARM_INTERVAL_SECS); 1645} 1646 1647int main(int argc, char **argv) 1648{ 1649 if (argc < 4) 1650 usage(); 1651 1652 if (signal(SIGALRM, sigalrm) == SIG_ERR) 1653 err("failed to arm SIGALRM"); 1654 alarm(ALARM_INTERVAL_SECS); 1655 1656 set_test_type(argv[1]); 1657 1658 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); 1659 nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / 1660 nr_cpus; 1661 if (!nr_pages_per_cpu) { 1662 _err("invalid MiB"); 1663 usage(); 1664 } 1665 1666 bounces = atoi(argv[3]); 1667 if (bounces <= 0) { 1668 _err("invalid bounces"); 1669 usage(); 1670 } 1671 nr_pages = nr_pages_per_cpu * nr_cpus; 1672 1673 if (test_type == TEST_HUGETLB && map_shared) { 1674 if (argc < 5) 1675 usage(); 1676 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); 1677 if (huge_fd < 0) 1678 err("Open of %s failed", argv[4]); 1679 if (ftruncate(huge_fd, 0)) 1680 err("ftruncate %s to size 0 failed", argv[4]); 1681 } else if (test_type == TEST_SHMEM) { 1682 shm_fd = memfd_create(argv[0], 0); 1683 if (shm_fd < 0) 1684 err("memfd_create"); 1685 if (ftruncate(shm_fd, nr_pages * page_size * 2)) 1686 err("ftruncate"); 1687 if (fallocate(shm_fd, 1688 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 1689 nr_pages * page_size * 2)) 1690 err("fallocate"); 1691 } 1692 printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", 1693 nr_pages, nr_pages_per_cpu); 1694 return userfaultfd_stress(); 1695} 1696 1697#else /* __NR_userfaultfd */ 1698 1699#warning "missing __NR_userfaultfd definition" 1700 1701int main(void) 1702{ 1703 printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); 1704 return KSFT_SKIP; 1705} 1706 1707#endif /* __NR_userfaultfd */