userfaultfd.c (9437B)
1/* 2 * Linux UFFD-WP support 3 * 4 * Copyright Virtuozzo GmbH, 2020 5 * 6 * Authors: 7 * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or 10 * later. See the COPYING file in the top-level directory. 11 */ 12 13#include "qemu/osdep.h" 14#include "qemu/bitops.h" 15#include "qemu/error-report.h" 16#include "qemu/userfaultfd.h" 17#include "trace.h" 18#include <poll.h> 19#include <sys/syscall.h> 20#include <sys/ioctl.h> 21 22/** 23 * uffd_query_features: query UFFD features 24 * 25 * Returns: 0 on success, negative value in case of an error 26 * 27 * @features: parameter to receive 'uffdio_api.features' 28 */ 29int uffd_query_features(uint64_t *features) 30{ 31 int uffd_fd; 32 struct uffdio_api api_struct = { 0 }; 33 int ret = -1; 34 35 uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); 36 if (uffd_fd < 0) { 37 trace_uffd_query_features_nosys(errno); 38 return -1; 39 } 40 41 api_struct.api = UFFD_API; 42 api_struct.features = 0; 43 44 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { 45 trace_uffd_query_features_api_failed(errno); 46 goto out; 47 } 48 *features = api_struct.features; 49 ret = 0; 50 51out: 52 close(uffd_fd); 53 return ret; 54} 55 56/** 57 * uffd_create_fd: create UFFD file descriptor 58 * 59 * Returns non-negative file descriptor or negative value in case of an error 60 * 61 * @features: UFFD features to request 62 * @non_blocking: create UFFD file descriptor for non-blocking operation 63 */ 64int uffd_create_fd(uint64_t features, bool non_blocking) 65{ 66 int uffd_fd; 67 int flags; 68 struct uffdio_api api_struct = { 0 }; 69 uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); 70 71 flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); 72 uffd_fd = syscall(__NR_userfaultfd, flags); 73 if (uffd_fd < 0) { 74 trace_uffd_create_fd_nosys(errno); 75 return -1; 76 } 77 78 api_struct.api = UFFD_API; 79 api_struct.features = features; 80 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { 81 trace_uffd_create_fd_api_failed(errno); 82 goto fail; 83 } 84 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { 85 trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); 86 goto fail; 87 } 88 89 return uffd_fd; 90 91fail: 92 close(uffd_fd); 93 return -1; 94} 95 96/** 97 * uffd_close_fd: close UFFD file descriptor 98 * 99 * @uffd_fd: UFFD file descriptor 100 */ 101void uffd_close_fd(int uffd_fd) 102{ 103 assert(uffd_fd >= 0); 104 close(uffd_fd); 105} 106 107/** 108 * uffd_register_memory: register memory range via UFFD-IO 109 * 110 * Returns 0 in case of success, negative value in case of an error 111 * 112 * @uffd_fd: UFFD file descriptor 113 * @addr: base address of memory range 114 * @length: length of memory range 115 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...) 116 * @ioctls: optional pointer to receive supported IOCTL mask 117 */ 118int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, 119 uint64_t mode, uint64_t *ioctls) 120{ 121 struct uffdio_register uffd_register; 122 123 uffd_register.range.start = (uintptr_t) addr; 124 uffd_register.range.len = length; 125 uffd_register.mode = mode; 126 127 if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) { 128 trace_uffd_register_memory_failed(addr, length, mode, errno); 129 return -1; 130 } 131 if (ioctls) { 132 *ioctls = uffd_register.ioctls; 133 } 134 135 return 0; 136} 137 138/** 139 * uffd_unregister_memory: un-register memory range with UFFD-IO 140 * 141 * Returns 0 in case of success, negative value in case of an error 142 * 143 * @uffd_fd: UFFD file descriptor 144 * @addr: base address of memory range 145 * @length: length of memory range 146 */ 147int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length) 148{ 149 struct uffdio_range uffd_range; 150 151 uffd_range.start = (uintptr_t) addr; 152 uffd_range.len = length; 153 154 if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) { 155 trace_uffd_unregister_memory_failed(addr, length, errno); 156 return -1; 157 } 158 159 return 0; 160} 161 162/** 163 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO 164 * 165 * Returns 0 on success, negative value in case of error 166 * 167 * @uffd_fd: UFFD file descriptor 168 * @addr: base address of memory range 169 * @length: length of memory range 170 * @wp: write-protect/unprotect 171 * @dont_wake: do not wake threads waiting on wr-protected page 172 */ 173int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, 174 bool wp, bool dont_wake) 175{ 176 struct uffdio_writeprotect uffd_writeprotect; 177 178 uffd_writeprotect.range.start = (uintptr_t) addr; 179 uffd_writeprotect.range.len = length; 180 if (!wp && dont_wake) { 181 /* DONTWAKE is meaningful only on protection release */ 182 uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE; 183 } else { 184 uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0); 185 } 186 187 if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { 188 error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64 189 " mode=%" PRIx64 " errno=%i", addr, length, 190 (uint64_t) uffd_writeprotect.mode, errno); 191 return -1; 192 } 193 194 return 0; 195} 196 197/** 198 * uffd_copy_page: copy range of pages to destination via UFFD-IO 199 * 200 * Copy range of source pages to the destination to resolve 201 * missing page fault somewhere in the destination range. 202 * 203 * Returns 0 on success, negative value in case of an error 204 * 205 * @uffd_fd: UFFD file descriptor 206 * @dst_addr: destination base address 207 * @src_addr: source base address 208 * @length: length of the range to copy 209 * @dont_wake: do not wake threads waiting on missing page 210 */ 211int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, 212 uint64_t length, bool dont_wake) 213{ 214 struct uffdio_copy uffd_copy; 215 216 uffd_copy.dst = (uintptr_t) dst_addr; 217 uffd_copy.src = (uintptr_t) src_addr; 218 uffd_copy.len = length; 219 uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0; 220 221 if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) { 222 error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64 223 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr, 224 length, (uint64_t) uffd_copy.mode, errno); 225 return -1; 226 } 227 228 return 0; 229} 230 231/** 232 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO 233 * 234 * Fill range pages with zeroes to resolve missing page fault within the range. 235 * 236 * Returns 0 on success, negative value in case of an error 237 * 238 * @uffd_fd: UFFD file descriptor 239 * @addr: base address 240 * @length: length of the range to fill with zeroes 241 * @dont_wake: do not wake threads waiting on missing page 242 */ 243int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake) 244{ 245 struct uffdio_zeropage uffd_zeropage; 246 247 uffd_zeropage.range.start = (uintptr_t) addr; 248 uffd_zeropage.range.len = length; 249 uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0; 250 251 if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) { 252 error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64 253 " mode=%" PRIx64 " errno=%i", addr, length, 254 (uint64_t) uffd_zeropage.mode, errno); 255 return -1; 256 } 257 258 return 0; 259} 260 261/** 262 * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution 263 * 264 * Wake up threads waiting on any page/pages from the designated range. 265 * The main use case is when during some period, page faults are resolved 266 * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits 267 * for the whole memory range are satisfied in a single call to uffd_wakeup(). 268 * 269 * Returns 0 on success, negative value in case of an error 270 * 271 * @uffd_fd: UFFD file descriptor 272 * @addr: base address 273 * @length: length of the range 274 */ 275int uffd_wakeup(int uffd_fd, void *addr, uint64_t length) 276{ 277 struct uffdio_range uffd_range; 278 279 uffd_range.start = (uintptr_t) addr; 280 uffd_range.len = length; 281 282 if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) { 283 error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i", 284 addr, length, errno); 285 return -1; 286 } 287 288 return 0; 289} 290 291/** 292 * uffd_read_events: read pending UFFD events 293 * 294 * Returns number of fetched messages, 0 if non is available or 295 * negative value in case of an error 296 * 297 * @uffd_fd: UFFD file descriptor 298 * @msgs: pointer to message buffer 299 * @count: number of messages that can fit in the buffer 300 */ 301int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) 302{ 303 ssize_t res; 304 do { 305 res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg)); 306 } while (res < 0 && errno == EINTR); 307 308 if ((res < 0 && errno == EAGAIN)) { 309 return 0; 310 } 311 if (res < 0) { 312 error_report("uffd_read_events() failed: errno=%i", errno); 313 return -1; 314 } 315 316 return (int) (res / sizeof(struct uffd_msg)); 317} 318 319/** 320 * uffd_poll_events: poll UFFD file descriptor for read 321 * 322 * Returns true if events are available for read, false otherwise 323 * 324 * @uffd_fd: UFFD file descriptor 325 * @tmo: timeout value 326 */ 327bool uffd_poll_events(int uffd_fd, int tmo) 328{ 329 int res; 330 struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; 331 332 do { 333 res = poll(&poll_fd, 1, tmo); 334 } while (res < 0 && errno == EINTR); 335 336 if (res == 0) { 337 return false; 338 } 339 if (res < 0) { 340 error_report("uffd_poll_events() failed: errno=%i", errno); 341 return false; 342 } 343 344 return (poll_fd.revents & POLLIN) != 0; 345}