passthrough_ll.c (109004B)
1/* 2 * FUSE: Filesystem in Userspace 3 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> 4 * 5 * This program can be distributed under the terms of the GNU GPLv2. 6 * See the file COPYING. 7 */ 8 9/* 10 * 11 * This file system mirrors the existing file system hierarchy of the 12 * system, starting at the root file system. This is implemented by 13 * just "passing through" all requests to the corresponding user-space 14 * libc functions. In contrast to passthrough.c and passthrough_fh.c, 15 * this implementation uses the low-level API. Its performance should 16 * be the least bad among the three, but many operations are not 17 * implemented. In particular, it is not possible to remove files (or 18 * directories) because the code necessary to defer actual removal 19 * until the file is not opened anymore would make the example much 20 * more complicated. 21 * 22 * When writeback caching is enabled (-o writeback mount option), it 23 * is only possible to write to files for which the mounting user has 24 * read permissions. This is because the writeback cache requires the 25 * kernel to be able to issue read requests for all files (which the 26 * passthrough filesystem cannot satisfy if it can't read the file in 27 * the underlying filesystem). 
28 * 29 * Compile with: 30 * 31 * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o 32 * passthrough_ll 33 * 34 * ## Source code ## 35 * \include passthrough_ll.c 36 */ 37 38#include "qemu/osdep.h" 39#include "qemu/timer.h" 40#include "qemu-version.h" 41#include "qemu-common.h" 42#include "fuse_virtio.h" 43#include "fuse_log.h" 44#include "fuse_lowlevel.h" 45#include "standard-headers/linux/fuse.h" 46#include <cap-ng.h> 47#include <dirent.h> 48#include <pthread.h> 49#include <sys/file.h> 50#include <sys/mount.h> 51#include <sys/prctl.h> 52#include <sys/resource.h> 53#include <sys/syscall.h> 54#include <sys/wait.h> 55#include <sys/xattr.h> 56#include <syslog.h> 57 58#include "qemu/cutils.h" 59#include "passthrough_helpers.h" 60#include "passthrough_seccomp.h" 61 62/* Keep track of inode posix locks for each owner. */ 63struct lo_inode_plock { 64 uint64_t lock_owner; 65 int fd; /* fd for OFD locks */ 66}; 67 68struct lo_map_elem { 69 union { 70 struct lo_inode *inode; 71 struct lo_dirp *dirp; 72 int fd; 73 ssize_t freelist; 74 }; 75 bool in_use; 76}; 77 78/* Maps FUSE fh or ino values to internal objects */ 79struct lo_map { 80 struct lo_map_elem *elems; 81 size_t nelems; 82 ssize_t freelist; 83}; 84 85struct lo_key { 86 ino_t ino; 87 dev_t dev; 88 uint64_t mnt_id; 89}; 90 91struct lo_inode { 92 int fd; 93 94 /* 95 * Atomic reference count for this object. The nlookup field holds a 96 * reference and release it when nlookup reaches 0. 97 */ 98 gint refcount; 99 100 struct lo_key key; 101 102 /* 103 * This counter keeps the inode alive during the FUSE session. 104 * Incremented when the FUSE inode number is sent in a reply 105 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is 106 * released by a FUSE_FORGET request. 107 * 108 * Note that this value is untrusted because the client can manipulate 109 * it arbitrarily using FUSE_FORGET requests. 110 * 111 * Protected by lo->mutex. 
112 */ 113 uint64_t nlookup; 114 115 fuse_ino_t fuse_ino; 116 pthread_mutex_t plock_mutex; 117 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ 118 119 mode_t filetype; 120}; 121 122struct lo_cred { 123 uid_t euid; 124 gid_t egid; 125 mode_t umask; 126}; 127 128enum { 129 CACHE_NONE, 130 CACHE_AUTO, 131 CACHE_ALWAYS, 132}; 133 134enum { 135 SANDBOX_NAMESPACE, 136 SANDBOX_CHROOT, 137}; 138 139typedef struct xattr_map_entry { 140 char *key; 141 char *prepend; 142 unsigned int flags; 143} XattrMapEntry; 144 145struct lo_data { 146 pthread_mutex_t mutex; 147 int sandbox; 148 int debug; 149 int writeback; 150 int flock; 151 int posix_lock; 152 int xattr; 153 char *xattrmap; 154 char *xattr_security_capability; 155 char *source; 156 char *modcaps; 157 double timeout; 158 int cache; 159 int timeout_set; 160 int readdirplus_set; 161 int readdirplus_clear; 162 int allow_direct_io; 163 int announce_submounts; 164 bool use_statx; 165 struct lo_inode root; 166 GHashTable *inodes; /* protected by lo->mutex */ 167 struct lo_map ino_map; /* protected by lo->mutex */ 168 struct lo_map dirp_map; /* protected by lo->mutex */ 169 struct lo_map fd_map; /* protected by lo->mutex */ 170 XattrMapEntry *xattr_map_list; 171 size_t xattr_map_nentries; 172 173 /* An O_PATH file descriptor to /proc/self/fd/ */ 174 int proc_self_fd; 175 int user_killpriv_v2, killpriv_v2; 176 /* If set, virtiofsd is responsible for setting umask during creation */ 177 bool change_umask; 178 int user_posix_acl, posix_acl; 179}; 180 181static const struct fuse_opt lo_opts[] = { 182 { "sandbox=namespace", 183 offsetof(struct lo_data, sandbox), 184 SANDBOX_NAMESPACE }, 185 { "sandbox=chroot", 186 offsetof(struct lo_data, sandbox), 187 SANDBOX_CHROOT }, 188 { "writeback", offsetof(struct lo_data, writeback), 1 }, 189 { "no_writeback", offsetof(struct lo_data, writeback), 0 }, 190 { "source=%s", offsetof(struct lo_data, source), 0 }, 191 { "flock", offsetof(struct lo_data, flock), 1 }, 192 { 
"no_flock", offsetof(struct lo_data, flock), 0 }, 193 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 }, 194 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 }, 195 { "xattr", offsetof(struct lo_data, xattr), 1 }, 196 { "no_xattr", offsetof(struct lo_data, xattr), 0 }, 197 { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 }, 198 { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 }, 199 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, 200 { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, 201 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE }, 202 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO }, 203 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, 204 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, 205 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, 206 { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 }, 207 { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 }, 208 { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 }, 209 { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 }, 210 { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 }, 211 { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 }, 212 { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 }, 213 FUSE_OPT_END 214}; 215static bool use_syslog = false; 216static int current_log_level; 217static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, 218 uint64_t n); 219 220static struct { 221 pthread_mutex_t mutex; 222 void *saved; 223} cap; 224/* That we loaded cap-ng in the current thread from the saved */ 225static __thread bool cap_loaded = 0; 226 227static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st, 228 uint64_t mnt_id); 229static int xattr_map_client(const struct lo_data *lo, const char *client_name, 230 char **out_name); 231 232static bool 
is_dot_or_dotdot(const char *name) 233{ 234 return name[0] == '.' && 235 (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); 236} 237 238/* Is `path` a single path component that is not "." or ".."? */ 239static bool is_safe_path_component(const char *path) 240{ 241 if (strchr(path, '/')) { 242 return false; 243 } 244 245 return !is_dot_or_dotdot(path); 246} 247 248static bool is_empty(const char *name) 249{ 250 return name[0] == '\0'; 251} 252 253static struct lo_data *lo_data(fuse_req_t req) 254{ 255 return (struct lo_data *)fuse_req_userdata(req); 256} 257 258/* 259 * Load capng's state from our saved state if the current thread 260 * hadn't previously been loaded. 261 * returns 0 on success 262 */ 263static int load_capng(void) 264{ 265 if (!cap_loaded) { 266 pthread_mutex_lock(&cap.mutex); 267 capng_restore_state(&cap.saved); 268 /* 269 * restore_state free's the saved copy 270 * so make another. 271 */ 272 cap.saved = capng_save_state(); 273 if (!cap.saved) { 274 pthread_mutex_unlock(&cap.mutex); 275 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); 276 return -EINVAL; 277 } 278 pthread_mutex_unlock(&cap.mutex); 279 280 /* 281 * We want to use the loaded state for our pid, 282 * not the original 283 */ 284 capng_setpid(syscall(SYS_gettid)); 285 cap_loaded = true; 286 } 287 return 0; 288} 289 290/* 291 * Helpers for dropping and regaining effective capabilities. Returns 0 292 * on success, error otherwise 293 */ 294static int drop_effective_cap(const char *cap_name, bool *cap_dropped) 295{ 296 int cap, ret; 297 298 cap = capng_name_to_capability(cap_name); 299 if (cap < 0) { 300 ret = errno; 301 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", 302 cap_name, strerror(errno)); 303 goto out; 304 } 305 306 if (load_capng()) { 307 ret = errno; 308 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); 309 goto out; 310 } 311 312 /* We dont have this capability in effective set already. 
*/ 313 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) { 314 ret = 0; 315 goto out; 316 } 317 318 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) { 319 ret = errno; 320 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n"); 321 goto out; 322 } 323 324 if (capng_apply(CAPNG_SELECT_CAPS)) { 325 ret = errno; 326 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n"); 327 goto out; 328 } 329 330 ret = 0; 331 if (cap_dropped) { 332 *cap_dropped = true; 333 } 334 335out: 336 return ret; 337} 338 339static int gain_effective_cap(const char *cap_name) 340{ 341 int cap; 342 int ret = 0; 343 344 cap = capng_name_to_capability(cap_name); 345 if (cap < 0) { 346 ret = errno; 347 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", 348 cap_name, strerror(errno)); 349 goto out; 350 } 351 352 if (load_capng()) { 353 ret = errno; 354 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); 355 goto out; 356 } 357 358 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) { 359 ret = errno; 360 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n"); 361 goto out; 362 } 363 364 if (capng_apply(CAPNG_SELECT_CAPS)) { 365 ret = errno; 366 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n"); 367 goto out; 368 } 369 ret = 0; 370 371out: 372 return ret; 373} 374 375/* 376 * The host kernel normally drops security.capability xattr's on 377 * any write, however if we're remapping xattr names we need to drop 378 * whatever the clients security.capability is actually stored as. 
379 */ 380static int drop_security_capability(const struct lo_data *lo, int fd) 381{ 382 if (!lo->xattr_security_capability) { 383 /* We didn't remap the name, let the host kernel do it */ 384 return 0; 385 } 386 if (!fremovexattr(fd, lo->xattr_security_capability)) { 387 /* All good */ 388 return 0; 389 } 390 391 switch (errno) { 392 case ENODATA: 393 /* Attribute didn't exist, that's fine */ 394 return 0; 395 396 case ENOTSUP: 397 /* FS didn't support attribute anyway, also fine */ 398 return 0; 399 400 default: 401 /* Hmm other error */ 402 return errno; 403 } 404} 405 406static void lo_map_init(struct lo_map *map) 407{ 408 map->elems = NULL; 409 map->nelems = 0; 410 map->freelist = -1; 411} 412 413static void lo_map_destroy(struct lo_map *map) 414{ 415 g_free(map->elems); 416} 417 418static int lo_map_grow(struct lo_map *map, size_t new_nelems) 419{ 420 struct lo_map_elem *new_elems; 421 size_t i; 422 423 if (new_nelems <= map->nelems) { 424 return 1; 425 } 426 427 new_elems = g_try_realloc_n(map->elems, new_nelems, sizeof(map->elems[0])); 428 if (!new_elems) { 429 return 0; 430 } 431 432 for (i = map->nelems; i < new_nelems; i++) { 433 new_elems[i].freelist = i + 1; 434 new_elems[i].in_use = false; 435 } 436 new_elems[new_nelems - 1].freelist = -1; 437 438 map->elems = new_elems; 439 map->freelist = map->nelems; 440 map->nelems = new_nelems; 441 return 1; 442} 443 444static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map) 445{ 446 struct lo_map_elem *elem; 447 448 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) { 449 return NULL; 450 } 451 452 elem = &map->elems[map->freelist]; 453 map->freelist = elem->freelist; 454 455 elem->in_use = true; 456 457 return elem; 458} 459 460static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key) 461{ 462 ssize_t *prev; 463 464 if (!lo_map_grow(map, key + 1)) { 465 return NULL; 466 } 467 468 for (prev = &map->freelist; *prev != -1; 469 prev = &map->elems[*prev].freelist) { 470 if 
(*prev == key) { 471 struct lo_map_elem *elem = &map->elems[key]; 472 473 *prev = elem->freelist; 474 elem->in_use = true; 475 return elem; 476 } 477 } 478 return NULL; 479} 480 481static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key) 482{ 483 if (key >= map->nelems) { 484 return NULL; 485 } 486 if (!map->elems[key].in_use) { 487 return NULL; 488 } 489 return &map->elems[key]; 490} 491 492static void lo_map_remove(struct lo_map *map, size_t key) 493{ 494 struct lo_map_elem *elem; 495 496 if (key >= map->nelems) { 497 return; 498 } 499 500 elem = &map->elems[key]; 501 if (!elem->in_use) { 502 return; 503 } 504 505 elem->in_use = false; 506 507 elem->freelist = map->freelist; 508 map->freelist = key; 509} 510 511/* Assumes lo->mutex is held */ 512static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd) 513{ 514 struct lo_map_elem *elem; 515 516 elem = lo_map_alloc_elem(&lo->fd_map); 517 if (!elem) { 518 return -1; 519 } 520 521 elem->fd = fd; 522 return elem - lo->fd_map.elems; 523} 524 525/* Assumes lo->mutex is held */ 526static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) 527{ 528 struct lo_map_elem *elem; 529 530 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map); 531 if (!elem) { 532 return -1; 533 } 534 535 elem->dirp = dirp; 536 return elem - lo_data(req)->dirp_map.elems; 537} 538 539/* Assumes lo->mutex is held */ 540static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) 541{ 542 struct lo_map_elem *elem; 543 544 elem = lo_map_alloc_elem(&lo_data(req)->ino_map); 545 if (!elem) { 546 return -1; 547 } 548 549 elem->inode = inode; 550 return elem - lo_data(req)->ino_map.elems; 551} 552 553static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep) 554{ 555 struct lo_inode *inode = *inodep; 556 557 if (!inode) { 558 return; 559 } 560 561 *inodep = NULL; 562 563 if (g_atomic_int_dec_and_test(&inode->refcount)) { 564 close(inode->fd); 565 free(inode); 566 } 567} 568 569/* Caller must release 
refcount using lo_inode_put() */ 570static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) 571{ 572 struct lo_data *lo = lo_data(req); 573 struct lo_map_elem *elem; 574 575 pthread_mutex_lock(&lo->mutex); 576 elem = lo_map_get(&lo->ino_map, ino); 577 if (elem) { 578 g_atomic_int_inc(&elem->inode->refcount); 579 } 580 pthread_mutex_unlock(&lo->mutex); 581 582 if (!elem) { 583 return NULL; 584 } 585 586 return elem->inode; 587} 588 589/* 590 * TODO Remove this helper and force callers to hold an inode refcount until 591 * they are done with the fd. This will be done in a later patch to make 592 * review easier. 593 */ 594static int lo_fd(fuse_req_t req, fuse_ino_t ino) 595{ 596 struct lo_inode *inode = lo_inode(req, ino); 597 int fd; 598 599 if (!inode) { 600 return -1; 601 } 602 603 fd = inode->fd; 604 lo_inode_put(lo_data(req), &inode); 605 return fd; 606} 607 608/* 609 * Open a file descriptor for an inode. Returns -EBADF if the inode is not a 610 * regular file or a directory. 611 * 612 * Use this helper function instead of raw openat(2) to prevent security issues 613 * when a malicious client opens special files such as block device nodes. 614 * Symlink inodes are also rejected since symlinks must already have been 615 * traversed on the client side. 616 */ 617static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode, 618 int open_flags) 619{ 620 g_autofree char *fd_str = g_strdup_printf("%d", inode->fd); 621 int fd; 622 623 if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) { 624 return -EBADF; 625 } 626 627 /* 628 * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier 629 * that the inode is not a special file but if an external process races 630 * with us then symlinks are traversed here. It is not possible to escape 631 * the shared directory since it is mounted as "/" though. 
632 */ 633 fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW); 634 if (fd < 0) { 635 return -errno; 636 } 637 return fd; 638} 639 640static void lo_init(void *userdata, struct fuse_conn_info *conn) 641{ 642 struct lo_data *lo = (struct lo_data *)userdata; 643 644 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) { 645 conn->want |= FUSE_CAP_EXPORT_SUPPORT; 646 } 647 648 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { 649 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); 650 conn->want |= FUSE_CAP_WRITEBACK_CACHE; 651 } 652 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) { 653 if (lo->flock) { 654 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); 655 conn->want |= FUSE_CAP_FLOCK_LOCKS; 656 } else { 657 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n"); 658 conn->want &= ~FUSE_CAP_FLOCK_LOCKS; 659 } 660 } 661 662 if (conn->capable & FUSE_CAP_POSIX_LOCKS) { 663 if (lo->posix_lock) { 664 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n"); 665 conn->want |= FUSE_CAP_POSIX_LOCKS; 666 } else { 667 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n"); 668 conn->want &= ~FUSE_CAP_POSIX_LOCKS; 669 } 670 } 671 672 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || 673 lo->readdirplus_clear) { 674 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); 675 conn->want &= ~FUSE_CAP_READDIRPLUS; 676 } 677 678 if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) { 679 fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client " 680 "does not support it\n"); 681 lo->announce_submounts = false; 682 } 683 684 if (lo->user_killpriv_v2 == 1) { 685 /* 686 * User explicitly asked for this option. Enable it unconditionally. 
687 * If connection does not have this capability, it should fail 688 * in fuse_lowlevel.c 689 */ 690 fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); 691 conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; 692 lo->killpriv_v2 = 1; 693 } else if (lo->user_killpriv_v2 == -1 && 694 conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) { 695 /* 696 * User did not specify a value for killpriv_v2. By default enable it 697 * if connection offers this capability 698 */ 699 fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); 700 conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; 701 lo->killpriv_v2 = 1; 702 } else { 703 /* 704 * Either user specified to disable killpriv_v2, or connection does 705 * not offer this capability. Disable killpriv_v2 in both the cases 706 */ 707 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n"); 708 conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2; 709 lo->killpriv_v2 = 0; 710 } 711 712 if (lo->user_posix_acl == 1) { 713 /* 714 * User explicitly asked for this option. Enable it unconditionally. 715 * If connection does not have this capability, print error message 716 * now. It will fail later in fuse_lowlevel.c 717 */ 718 if (!(conn->capable & FUSE_CAP_POSIX_ACL) || 719 !(conn->capable & FUSE_CAP_DONT_MASK) || 720 !(conn->capable & FUSE_CAP_SETXATTR_EXT)) { 721 fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl." 
722 " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK" 723 " or FUSE_SETXATTR_EXT capability.\n"); 724 } else { 725 fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n"); 726 } 727 728 conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK | 729 FUSE_CAP_SETXATTR_EXT; 730 lo->change_umask = true; 731 lo->posix_acl = true; 732 } else { 733 /* User either did not specify anything or wants it disabled */ 734 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n"); 735 conn->want &= ~FUSE_CAP_POSIX_ACL; 736 } 737} 738 739static void lo_getattr(fuse_req_t req, fuse_ino_t ino, 740 struct fuse_file_info *fi) 741{ 742 int res; 743 struct stat buf; 744 struct lo_data *lo = lo_data(req); 745 746 (void)fi; 747 748 res = 749 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 750 if (res == -1) { 751 return (void)fuse_reply_err(req, errno); 752 } 753 754 fuse_reply_attr(req, &buf, lo->timeout); 755} 756 757static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) 758{ 759 struct lo_data *lo = lo_data(req); 760 struct lo_map_elem *elem; 761 762 pthread_mutex_lock(&lo->mutex); 763 elem = lo_map_get(&lo->fd_map, fi->fh); 764 pthread_mutex_unlock(&lo->mutex); 765 766 if (!elem) { 767 return -1; 768 } 769 770 return elem->fd; 771} 772 773static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, 774 int valid, struct fuse_file_info *fi) 775{ 776 int saverr; 777 char procname[64]; 778 struct lo_data *lo = lo_data(req); 779 struct lo_inode *inode; 780 int ifd; 781 int res; 782 int fd = -1; 783 784 inode = lo_inode(req, ino); 785 if (!inode) { 786 fuse_reply_err(req, EBADF); 787 return; 788 } 789 790 ifd = inode->fd; 791 792 /* If fi->fh is invalid we'll report EBADF later */ 793 if (fi) { 794 fd = lo_fi_fd(req, fi); 795 } 796 797 if (valid & FUSE_SET_ATTR_MODE) { 798 if (fi) { 799 res = fchmod(fd, attr->st_mode); 800 } else { 801 sprintf(procname, "%i", ifd); 802 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); 803 } 
804 if (res == -1) { 805 saverr = errno; 806 goto out_err; 807 } 808 } 809 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { 810 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1; 811 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1; 812 813 saverr = drop_security_capability(lo, ifd); 814 if (saverr) { 815 goto out_err; 816 } 817 818 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 819 if (res == -1) { 820 saverr = errno; 821 goto out_err; 822 } 823 } 824 if (valid & FUSE_SET_ATTR_SIZE) { 825 int truncfd; 826 bool kill_suidgid; 827 bool cap_fsetid_dropped = false; 828 829 kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID); 830 if (fi) { 831 truncfd = fd; 832 } else { 833 truncfd = lo_inode_open(lo, inode, O_RDWR); 834 if (truncfd < 0) { 835 saverr = -truncfd; 836 goto out_err; 837 } 838 } 839 840 saverr = drop_security_capability(lo, truncfd); 841 if (saverr) { 842 if (!fi) { 843 close(truncfd); 844 } 845 goto out_err; 846 } 847 848 if (kill_suidgid) { 849 res = drop_effective_cap("FSETID", &cap_fsetid_dropped); 850 if (res != 0) { 851 saverr = res; 852 if (!fi) { 853 close(truncfd); 854 } 855 goto out_err; 856 } 857 } 858 859 res = ftruncate(truncfd, attr->st_size); 860 saverr = res == -1 ? 
errno : 0; 861 862 if (cap_fsetid_dropped) { 863 if (gain_effective_cap("FSETID")) { 864 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); 865 } 866 } 867 if (!fi) { 868 close(truncfd); 869 } 870 if (res == -1) { 871 goto out_err; 872 } 873 } 874 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { 875 struct timespec tv[2]; 876 877 tv[0].tv_sec = 0; 878 tv[1].tv_sec = 0; 879 tv[0].tv_nsec = UTIME_OMIT; 880 tv[1].tv_nsec = UTIME_OMIT; 881 882 if (valid & FUSE_SET_ATTR_ATIME_NOW) { 883 tv[0].tv_nsec = UTIME_NOW; 884 } else if (valid & FUSE_SET_ATTR_ATIME) { 885 tv[0] = attr->st_atim; 886 } 887 888 if (valid & FUSE_SET_ATTR_MTIME_NOW) { 889 tv[1].tv_nsec = UTIME_NOW; 890 } else if (valid & FUSE_SET_ATTR_MTIME) { 891 tv[1] = attr->st_mtim; 892 } 893 894 if (fi) { 895 res = futimens(fd, tv); 896 } else { 897 sprintf(procname, "%i", inode->fd); 898 res = utimensat(lo->proc_self_fd, procname, tv, 0); 899 } 900 if (res == -1) { 901 saverr = errno; 902 goto out_err; 903 } 904 } 905 lo_inode_put(lo, &inode); 906 907 return lo_getattr(req, ino, fi); 908 909out_err: 910 lo_inode_put(lo, &inode); 911 fuse_reply_err(req, saverr); 912} 913 914static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st, 915 uint64_t mnt_id) 916{ 917 struct lo_inode *p; 918 struct lo_key key = { 919 .ino = st->st_ino, 920 .dev = st->st_dev, 921 .mnt_id = mnt_id, 922 }; 923 924 pthread_mutex_lock(&lo->mutex); 925 p = g_hash_table_lookup(lo->inodes, &key); 926 if (p) { 927 assert(p->nlookup > 0); 928 p->nlookup++; 929 g_atomic_int_inc(&p->refcount); 930 } 931 pthread_mutex_unlock(&lo->mutex); 932 933 return p; 934} 935 936/* value_destroy_func for posix_locks GHashTable */ 937static void posix_locks_value_destroy(gpointer data) 938{ 939 struct lo_inode_plock *plock = data; 940 941 /* 942 * We had used open() for locks and had only one fd. So 943 * closing this fd should release all OFD locks. 
944 */ 945 close(plock->fd); 946 free(plock); 947} 948 949static int do_statx(struct lo_data *lo, int dirfd, const char *pathname, 950 struct stat *statbuf, int flags, uint64_t *mnt_id) 951{ 952 int res; 953 954#if defined(CONFIG_STATX) && defined(STATX_MNT_ID) 955 if (lo->use_statx) { 956 struct statx statxbuf; 957 958 res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID, 959 &statxbuf); 960 if (!res) { 961 memset(statbuf, 0, sizeof(*statbuf)); 962 statbuf->st_dev = makedev(statxbuf.stx_dev_major, 963 statxbuf.stx_dev_minor); 964 statbuf->st_ino = statxbuf.stx_ino; 965 statbuf->st_mode = statxbuf.stx_mode; 966 statbuf->st_nlink = statxbuf.stx_nlink; 967 statbuf->st_uid = statxbuf.stx_uid; 968 statbuf->st_gid = statxbuf.stx_gid; 969 statbuf->st_rdev = makedev(statxbuf.stx_rdev_major, 970 statxbuf.stx_rdev_minor); 971 statbuf->st_size = statxbuf.stx_size; 972 statbuf->st_blksize = statxbuf.stx_blksize; 973 statbuf->st_blocks = statxbuf.stx_blocks; 974 statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec; 975 statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec; 976 statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec; 977 statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec; 978 statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec; 979 statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec; 980 981 if (statxbuf.stx_mask & STATX_MNT_ID) { 982 *mnt_id = statxbuf.stx_mnt_id; 983 } else { 984 *mnt_id = 0; 985 } 986 return 0; 987 } else if (errno != ENOSYS) { 988 return -1; 989 } 990 lo->use_statx = false; 991 /* fallback */ 992 } 993#endif 994 res = fstatat(dirfd, pathname, statbuf, flags); 995 if (res == -1) { 996 return -1; 997 } 998 *mnt_id = 0; 999 1000 return 0; 1001} 1002 1003/* 1004 * Increments nlookup on the inode on success. unref_inode_lolocked() must be 1005 * called eventually to decrement nlookup again. If inodep is non-NULL, the 1006 * inode pointer is stored and the caller must call lo_inode_put(). 
1007 */ 1008static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, 1009 struct fuse_entry_param *e, 1010 struct lo_inode **inodep) 1011{ 1012 int newfd; 1013 int res; 1014 int saverr; 1015 uint64_t mnt_id; 1016 struct lo_data *lo = lo_data(req); 1017 struct lo_inode *inode = NULL; 1018 struct lo_inode *dir = lo_inode(req, parent); 1019 1020 if (inodep) { 1021 *inodep = NULL; /* in case there is an error */ 1022 } 1023 1024 /* 1025 * name_to_handle_at() and open_by_handle_at() can reach here with fuse 1026 * mount point in guest, but we don't have its inode info in the 1027 * ino_map. 1028 */ 1029 if (!dir) { 1030 return ENOENT; 1031 } 1032 1033 memset(e, 0, sizeof(*e)); 1034 e->attr_timeout = lo->timeout; 1035 e->entry_timeout = lo->timeout; 1036 1037 /* Do not allow escaping root directory */ 1038 if (dir == &lo->root && strcmp(name, "..") == 0) { 1039 name = "."; 1040 } 1041 1042 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW); 1043 if (newfd == -1) { 1044 goto out_err; 1045 } 1046 1047 res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, 1048 &mnt_id); 1049 if (res == -1) { 1050 goto out_err; 1051 } 1052 1053 if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts && 1054 (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) { 1055 e->attr_flags |= FUSE_ATTR_SUBMOUNT; 1056 } 1057 1058 inode = lo_find(lo, &e->attr, mnt_id); 1059 if (inode) { 1060 close(newfd); 1061 } else { 1062 inode = calloc(1, sizeof(struct lo_inode)); 1063 if (!inode) { 1064 goto out_err; 1065 } 1066 1067 /* cache only filetype */ 1068 inode->filetype = (e->attr.st_mode & S_IFMT); 1069 1070 /* 1071 * One for the caller and one for nlookup (released in 1072 * unref_inode_lolocked()) 1073 */ 1074 g_atomic_int_set(&inode->refcount, 2); 1075 1076 inode->nlookup = 1; 1077 inode->fd = newfd; 1078 inode->key.ino = e->attr.st_ino; 1079 inode->key.dev = e->attr.st_dev; 1080 inode->key.mnt_id = mnt_id; 1081 if (lo->posix_lock) { 1082 
pthread_mutex_init(&inode->plock_mutex, NULL); 1083 inode->posix_locks = g_hash_table_new_full( 1084 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); 1085 } 1086 pthread_mutex_lock(&lo->mutex); 1087 inode->fuse_ino = lo_add_inode_mapping(req, inode); 1088 g_hash_table_insert(lo->inodes, &inode->key, inode); 1089 pthread_mutex_unlock(&lo->mutex); 1090 } 1091 e->ino = inode->fuse_ino; 1092 1093 /* Transfer ownership of inode pointer to caller or drop it */ 1094 if (inodep) { 1095 *inodep = inode; 1096 } else { 1097 lo_inode_put(lo, &inode); 1098 } 1099 1100 lo_inode_put(lo, &dir); 1101 1102 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, 1103 name, (unsigned long long)e->ino); 1104 1105 return 0; 1106 1107out_err: 1108 saverr = errno; 1109 if (newfd != -1) { 1110 close(newfd); 1111 } 1112 lo_inode_put(lo, &inode); 1113 lo_inode_put(lo, &dir); 1114 return saverr; 1115} 1116 1117static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) 1118{ 1119 struct fuse_entry_param e; 1120 int err; 1121 1122 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent, 1123 name); 1124 1125 if (is_empty(name)) { 1126 fuse_reply_err(req, ENOENT); 1127 return; 1128 } 1129 1130 /* 1131 * Don't use is_safe_path_component(), allow "." and ".." for NFS export 1132 * support. 1133 */ 1134 if (strchr(name, '/')) { 1135 fuse_reply_err(req, EINVAL); 1136 return; 1137 } 1138 1139 err = lo_do_lookup(req, parent, name, &e, NULL); 1140 if (err) { 1141 fuse_reply_err(req, err); 1142 } else { 1143 fuse_reply_entry(req, &e); 1144 } 1145} 1146 1147/* 1148 * On some archs, setres*id is limited to 2^16 but they 1149 * provide setres*id32 variants that allow 2^32. 1150 * Others just let setres*id do 2^32 anyway. 
1151 */ 1152#ifdef SYS_setresgid32 1153#define OURSYS_setresgid SYS_setresgid32 1154#else 1155#define OURSYS_setresgid SYS_setresgid 1156#endif 1157 1158#ifdef SYS_setresuid32 1159#define OURSYS_setresuid SYS_setresuid32 1160#else 1161#define OURSYS_setresuid SYS_setresuid 1162#endif 1163 1164/* 1165 * Change to uid/gid of caller so that file is created with 1166 * ownership of caller. 1167 * TODO: What about selinux context? 1168 */ 1169static int lo_change_cred(fuse_req_t req, struct lo_cred *old, 1170 bool change_umask) 1171{ 1172 int res; 1173 1174 old->euid = geteuid(); 1175 old->egid = getegid(); 1176 1177 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1); 1178 if (res == -1) { 1179 return errno; 1180 } 1181 1182 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1); 1183 if (res == -1) { 1184 int errno_save = errno; 1185 1186 syscall(OURSYS_setresgid, -1, old->egid, -1); 1187 return errno_save; 1188 } 1189 1190 if (change_umask) { 1191 old->umask = umask(req->ctx.umask); 1192 } 1193 return 0; 1194} 1195 1196/* Regain Privileges */ 1197static void lo_restore_cred(struct lo_cred *old, bool restore_umask) 1198{ 1199 int res; 1200 1201 res = syscall(OURSYS_setresuid, -1, old->euid, -1); 1202 if (res == -1) { 1203 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid); 1204 exit(1); 1205 } 1206 1207 res = syscall(OURSYS_setresgid, -1, old->egid, -1); 1208 if (res == -1) { 1209 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid); 1210 exit(1); 1211 } 1212 1213 if (restore_umask) 1214 umask(old->umask); 1215} 1216 1217/* 1218 * A helper to change cred and drop capability. 
Returns 0 on success and 1219 * errno on error 1220 */ 1221static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old, 1222 bool change_umask, const char *cap_name, 1223 bool *cap_dropped) 1224{ 1225 int ret; 1226 bool __cap_dropped; 1227 1228 assert(cap_name); 1229 1230 ret = drop_effective_cap(cap_name, &__cap_dropped); 1231 if (ret) { 1232 return ret; 1233 } 1234 1235 ret = lo_change_cred(req, old, change_umask); 1236 if (ret) { 1237 if (__cap_dropped) { 1238 if (gain_effective_cap(cap_name)) { 1239 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name); 1240 } 1241 } 1242 } 1243 1244 if (cap_dropped) { 1245 *cap_dropped = __cap_dropped; 1246 } 1247 return ret; 1248} 1249 1250static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask, 1251 const char *cap_name) 1252{ 1253 assert(cap_name); 1254 1255 lo_restore_cred(old, restore_umask); 1256 1257 if (gain_effective_cap(cap_name)) { 1258 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name); 1259 } 1260} 1261 1262static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, 1263 const char *name, mode_t mode, dev_t rdev, 1264 const char *link) 1265{ 1266 int res; 1267 int saverr; 1268 struct lo_data *lo = lo_data(req); 1269 struct lo_inode *dir; 1270 struct fuse_entry_param e; 1271 struct lo_cred old = {}; 1272 1273 if (is_empty(name)) { 1274 fuse_reply_err(req, ENOENT); 1275 return; 1276 } 1277 1278 if (!is_safe_path_component(name)) { 1279 fuse_reply_err(req, EINVAL); 1280 return; 1281 } 1282 1283 dir = lo_inode(req, parent); 1284 if (!dir) { 1285 fuse_reply_err(req, EBADF); 1286 return; 1287 } 1288 1289 saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode)); 1290 if (saverr) { 1291 goto out; 1292 } 1293 1294 res = mknod_wrapper(dir->fd, name, link, mode, rdev); 1295 1296 saverr = errno; 1297 1298 lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode)); 1299 1300 if (res == -1) { 1301 goto out; 1302 } 1303 1304 saverr = lo_do_lookup(req, parent, name, 
&e, NULL); 1305 if (saverr) { 1306 goto out; 1307 } 1308 1309 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, 1310 name, (unsigned long long)e.ino); 1311 1312 fuse_reply_entry(req, &e); 1313 lo_inode_put(lo, &dir); 1314 return; 1315 1316out: 1317 lo_inode_put(lo, &dir); 1318 fuse_reply_err(req, saverr); 1319} 1320 1321static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, 1322 mode_t mode, dev_t rdev) 1323{ 1324 lo_mknod_symlink(req, parent, name, mode, rdev, NULL); 1325} 1326 1327static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, 1328 mode_t mode) 1329{ 1330 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); 1331} 1332 1333static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, 1334 const char *name) 1335{ 1336 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); 1337} 1338 1339static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, 1340 const char *name) 1341{ 1342 int res; 1343 struct lo_data *lo = lo_data(req); 1344 struct lo_inode *parent_inode; 1345 struct lo_inode *inode; 1346 struct fuse_entry_param e; 1347 char procname[64]; 1348 int saverr; 1349 1350 if (is_empty(name)) { 1351 fuse_reply_err(req, ENOENT); 1352 return; 1353 } 1354 1355 if (!is_safe_path_component(name)) { 1356 fuse_reply_err(req, EINVAL); 1357 return; 1358 } 1359 1360 parent_inode = lo_inode(req, parent); 1361 inode = lo_inode(req, ino); 1362 if (!parent_inode || !inode) { 1363 errno = EBADF; 1364 goto out_err; 1365 } 1366 1367 memset(&e, 0, sizeof(struct fuse_entry_param)); 1368 e.attr_timeout = lo->timeout; 1369 e.entry_timeout = lo->timeout; 1370 1371 sprintf(procname, "%i", inode->fd); 1372 res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name, 1373 AT_SYMLINK_FOLLOW); 1374 if (res == -1) { 1375 goto out_err; 1376 } 1377 1378 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); 1379 if (res == -1) { 1380 goto out_err; 1381 } 1382 1383 
pthread_mutex_lock(&lo->mutex); 1384 inode->nlookup++; 1385 pthread_mutex_unlock(&lo->mutex); 1386 e.ino = inode->fuse_ino; 1387 1388 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, 1389 name, (unsigned long long)e.ino); 1390 1391 fuse_reply_entry(req, &e); 1392 lo_inode_put(lo, &parent_inode); 1393 lo_inode_put(lo, &inode); 1394 return; 1395 1396out_err: 1397 saverr = errno; 1398 lo_inode_put(lo, &parent_inode); 1399 lo_inode_put(lo, &inode); 1400 fuse_reply_err(req, saverr); 1401} 1402 1403/* Increments nlookup and caller must release refcount using lo_inode_put() */ 1404static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, 1405 const char *name) 1406{ 1407 int res; 1408 uint64_t mnt_id; 1409 struct stat attr; 1410 struct lo_data *lo = lo_data(req); 1411 struct lo_inode *dir = lo_inode(req, parent); 1412 1413 if (!dir) { 1414 return NULL; 1415 } 1416 1417 res = do_statx(lo, dir->fd, name, &attr, AT_SYMLINK_NOFOLLOW, &mnt_id); 1418 lo_inode_put(lo, &dir); 1419 if (res == -1) { 1420 return NULL; 1421 } 1422 1423 return lo_find(lo, &attr, mnt_id); 1424} 1425 1426static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) 1427{ 1428 int res; 1429 struct lo_inode *inode; 1430 struct lo_data *lo = lo_data(req); 1431 1432 if (is_empty(name)) { 1433 fuse_reply_err(req, ENOENT); 1434 return; 1435 } 1436 1437 if (!is_safe_path_component(name)) { 1438 fuse_reply_err(req, EINVAL); 1439 return; 1440 } 1441 1442 inode = lookup_name(req, parent, name); 1443 if (!inode) { 1444 fuse_reply_err(req, EIO); 1445 return; 1446 } 1447 1448 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); 1449 1450 fuse_reply_err(req, res == -1 ? 
/*
 * FUSE rename handler: move @name in directory @parent to @newname in
 * directory @newparent.
 *
 * A non-zero @flags value is passed straight to the renameat2 syscall when
 * SYS_renameat2 is defined; otherwise (or when the syscall reports ENOSYS)
 * the request is answered with EINVAL.  With @flags == 0 plain renameat()
 * is used.
 *
 * Exactly one reply is sent on every path.  All inode references taken
 * here are released at the "out" label.
 */
static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
                      fuse_ino_t newparent, const char *newname,
                      unsigned int flags)
{
    int res;
    struct lo_inode *parent_inode;
    struct lo_inode *newparent_inode;
    struct lo_inode *oldinode = NULL;
    struct lo_inode *newinode = NULL;
    struct lo_data *lo = lo_data(req);

    if (is_empty(name) || is_empty(newname)) {
        fuse_reply_err(req, ENOENT);
        return;
    }

    if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    parent_inode = lo_inode(req, parent);
    newparent_inode = lo_inode(req, newparent);
    if (!parent_inode || !newparent_inode) {
        fuse_reply_err(req, EBADF);
        goto out;
    }

    /*
     * lookup_name() increments nlookup on the inode it returns; that count
     * is dropped again at "out" via unref_inode_lolocked().
     */
    oldinode = lookup_name(req, parent, name);
    newinode = lookup_name(req, newparent, newname);

    /* Only the source has to be known; newinode may legitimately be NULL. */
    if (!oldinode) {
        fuse_reply_err(req, EIO);
        goto out;
    }

    if (flags) {
#ifndef SYS_renameat2
        fuse_reply_err(req, EINVAL);
#else
        res = syscall(SYS_renameat2, parent_inode->fd, name,
                      newparent_inode->fd, newname, flags);
        /* A missing renameat2 is reported as EINVAL, like the build-time case */
        if (res == -1 && errno == ENOSYS) {
            fuse_reply_err(req, EINVAL);
        } else {
            fuse_reply_err(req, res == -1 ? errno : 0);
        }
#endif
        goto out;
    }

    res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);

    fuse_reply_err(req, res == -1 ? errno : 0);
out:
    /*
     * unref_inode_lolocked() returns early for a NULL inode.  The
     * lo_inode_put() calls below can also see NULL pointers here
     * (NOTE(review): relies on lo_inode_put() tolerating that; its
     * definition is outside this section).
     */
    unref_inode_lolocked(lo, oldinode, 1);
    unref_inode_lolocked(lo, newinode, 1);
    lo_inode_put(lo, &oldinode);
    lo_inode_put(lo, &newinode);
    lo_inode_put(lo, &parent_inode);
    lo_inode_put(lo, &newparent_inode);
}
1584{ 1585 struct lo_data *lo = lo_data(req); 1586 struct lo_inode *inode; 1587 1588 inode = lo_inode(req, ino); 1589 if (!inode) { 1590 return; 1591 } 1592 1593 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", 1594 (unsigned long long)ino, (unsigned long long)inode->nlookup, 1595 (unsigned long long)nlookup); 1596 1597 unref_inode_lolocked(lo, inode, nlookup); 1598 lo_inode_put(lo, &inode); 1599} 1600 1601static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) 1602{ 1603 lo_forget_one(req, ino, nlookup); 1604 fuse_reply_none(req); 1605} 1606 1607static void lo_forget_multi(fuse_req_t req, size_t count, 1608 struct fuse_forget_data *forgets) 1609{ 1610 int i; 1611 1612 for (i = 0; i < count; i++) { 1613 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); 1614 } 1615 fuse_reply_none(req); 1616} 1617 1618static void lo_readlink(fuse_req_t req, fuse_ino_t ino) 1619{ 1620 char buf[PATH_MAX + 1]; 1621 int res; 1622 1623 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); 1624 if (res == -1) { 1625 return (void)fuse_reply_err(req, errno); 1626 } 1627 1628 if (res == sizeof(buf)) { 1629 return (void)fuse_reply_err(req, ENAMETOOLONG); 1630 } 1631 1632 buf[res] = '\0'; 1633 1634 fuse_reply_readlink(req, buf); 1635} 1636 1637struct lo_dirp { 1638 gint refcount; 1639 DIR *dp; 1640 struct dirent *entry; 1641 off_t offset; 1642}; 1643 1644static void lo_dirp_put(struct lo_dirp **dp) 1645{ 1646 struct lo_dirp *d = *dp; 1647 1648 if (!d) { 1649 return; 1650 } 1651 *dp = NULL; 1652 1653 if (g_atomic_int_dec_and_test(&d->refcount)) { 1654 closedir(d->dp); 1655 free(d); 1656 } 1657} 1658 1659/* Call lo_dirp_put() on the return value when no longer needed */ 1660static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) 1661{ 1662 struct lo_data *lo = lo_data(req); 1663 struct lo_map_elem *elem; 1664 1665 pthread_mutex_lock(&lo->mutex); 1666 elem = lo_map_get(&lo->dirp_map, fi->fh); 1667 if (elem) { 1668 
g_atomic_int_inc(&elem->dirp->refcount); 1669 } 1670 pthread_mutex_unlock(&lo->mutex); 1671 if (!elem) { 1672 return NULL; 1673 } 1674 1675 return elem->dirp; 1676} 1677 1678static void lo_opendir(fuse_req_t req, fuse_ino_t ino, 1679 struct fuse_file_info *fi) 1680{ 1681 int error = ENOMEM; 1682 struct lo_data *lo = lo_data(req); 1683 struct lo_dirp *d; 1684 int fd; 1685 ssize_t fh; 1686 1687 d = calloc(1, sizeof(struct lo_dirp)); 1688 if (d == NULL) { 1689 goto out_err; 1690 } 1691 1692 fd = openat(lo_fd(req, ino), ".", O_RDONLY); 1693 if (fd == -1) { 1694 goto out_errno; 1695 } 1696 1697 d->dp = fdopendir(fd); 1698 if (d->dp == NULL) { 1699 goto out_errno; 1700 } 1701 1702 d->offset = 0; 1703 d->entry = NULL; 1704 1705 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */ 1706 pthread_mutex_lock(&lo->mutex); 1707 fh = lo_add_dirp_mapping(req, d); 1708 pthread_mutex_unlock(&lo->mutex); 1709 if (fh == -1) { 1710 goto out_err; 1711 } 1712 1713 fi->fh = fh; 1714 if (lo->cache == CACHE_ALWAYS) { 1715 fi->cache_readdir = 1; 1716 } 1717 fuse_reply_open(req, fi); 1718 return; 1719 1720out_errno: 1721 error = errno; 1722out_err: 1723 if (d) { 1724 if (d->dp) { 1725 closedir(d->dp); 1726 } else if (fd != -1) { 1727 close(fd); 1728 } 1729 free(d); 1730 } 1731 fuse_reply_err(req, error); 1732} 1733 1734static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, 1735 off_t offset, struct fuse_file_info *fi, int plus) 1736{ 1737 struct lo_data *lo = lo_data(req); 1738 struct lo_dirp *d = NULL; 1739 struct lo_inode *dinode; 1740 g_autofree char *buf = NULL; 1741 char *p; 1742 size_t rem = size; 1743 int err = EBADF; 1744 1745 dinode = lo_inode(req, ino); 1746 if (!dinode) { 1747 goto error; 1748 } 1749 1750 d = lo_dirp(req, fi); 1751 if (!d) { 1752 goto error; 1753 } 1754 1755 err = ENOMEM; 1756 buf = g_try_malloc0(size); 1757 if (!buf) { 1758 goto error; 1759 } 1760 p = buf; 1761 1762 if (offset != d->offset) { 1763 seekdir(d->dp, offset); 1764 
d->entry = NULL; 1765 d->offset = offset; 1766 } 1767 while (1) { 1768 size_t entsize; 1769 off_t nextoff; 1770 const char *name; 1771 1772 if (!d->entry) { 1773 errno = 0; 1774 d->entry = readdir(d->dp); 1775 if (!d->entry) { 1776 if (errno) { /* Error */ 1777 err = errno; 1778 goto error; 1779 } else { /* End of stream */ 1780 break; 1781 } 1782 } 1783 } 1784 nextoff = d->entry->d_off; 1785 name = d->entry->d_name; 1786 1787 fuse_ino_t entry_ino = 0; 1788 struct fuse_entry_param e = (struct fuse_entry_param){ 1789 .attr.st_ino = d->entry->d_ino, 1790 .attr.st_mode = d->entry->d_type << 12, 1791 }; 1792 1793 /* Hide root's parent directory */ 1794 if (dinode == &lo->root && strcmp(name, "..") == 0) { 1795 e.attr.st_ino = lo->root.key.ino; 1796 e.attr.st_mode = DT_DIR << 12; 1797 } 1798 1799 if (plus) { 1800 if (!is_dot_or_dotdot(name)) { 1801 err = lo_do_lookup(req, ino, name, &e, NULL); 1802 if (err) { 1803 goto error; 1804 } 1805 entry_ino = e.ino; 1806 } 1807 1808 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); 1809 } else { 1810 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff); 1811 } 1812 if (entsize > rem) { 1813 if (entry_ino != 0) { 1814 lo_forget_one(req, entry_ino, 1); 1815 } 1816 break; 1817 } 1818 1819 p += entsize; 1820 rem -= entsize; 1821 1822 d->entry = NULL; 1823 d->offset = nextoff; 1824 } 1825 1826 err = 0; 1827error: 1828 lo_dirp_put(&d); 1829 lo_inode_put(lo, &dinode); 1830 1831 /* 1832 * If there's an error, we can only signal it if we haven't stored 1833 * any entries yet - otherwise we'd end up with wrong lookup 1834 * counts for the entries that are already in the buffer. So we 1835 * return what we've collected until that point. 
1836 */ 1837 if (err && rem == size) { 1838 fuse_reply_err(req, err); 1839 } else { 1840 fuse_reply_buf(req, buf, size - rem); 1841 } 1842} 1843 1844static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, 1845 off_t offset, struct fuse_file_info *fi) 1846{ 1847 lo_do_readdir(req, ino, size, offset, fi, 0); 1848} 1849 1850static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, 1851 off_t offset, struct fuse_file_info *fi) 1852{ 1853 lo_do_readdir(req, ino, size, offset, fi, 1); 1854} 1855 1856static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, 1857 struct fuse_file_info *fi) 1858{ 1859 struct lo_data *lo = lo_data(req); 1860 struct lo_map_elem *elem; 1861 struct lo_dirp *d; 1862 1863 (void)ino; 1864 1865 pthread_mutex_lock(&lo->mutex); 1866 elem = lo_map_get(&lo->dirp_map, fi->fh); 1867 if (!elem) { 1868 pthread_mutex_unlock(&lo->mutex); 1869 fuse_reply_err(req, EBADF); 1870 return; 1871 } 1872 1873 d = elem->dirp; 1874 lo_map_remove(&lo->dirp_map, fi->fh); 1875 pthread_mutex_unlock(&lo->mutex); 1876 1877 lo_dirp_put(&d); /* paired with lo_opendir() */ 1878 1879 fuse_reply_err(req, 0); 1880} 1881 1882static void update_open_flags(int writeback, int allow_direct_io, 1883 struct fuse_file_info *fi) 1884{ 1885 /* 1886 * With writeback cache, kernel may send read requests even 1887 * when userspace opened write-only 1888 */ 1889 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { 1890 fi->flags &= ~O_ACCMODE; 1891 fi->flags |= O_RDWR; 1892 } 1893 1894 /* 1895 * With writeback cache, O_APPEND is handled by the kernel. 1896 * This breaks atomicity (since the file may change in the 1897 * underlying filesystem, so that the kernel's idea of the 1898 * end of the file isn't accurate anymore). In this example, 1899 * we just accept that. 
A more rigorous filesystem may want 1900 * to return an error here 1901 */ 1902 if (writeback && (fi->flags & O_APPEND)) { 1903 fi->flags &= ~O_APPEND; 1904 } 1905 1906 /* 1907 * O_DIRECT in guest should not necessarily mean bypassing page 1908 * cache on host as well. Therefore, we discard it by default 1909 * ('-o no_allow_direct_io'). If somebody needs that behavior, 1910 * the '-o allow_direct_io' option should be set. 1911 */ 1912 if (!allow_direct_io) { 1913 fi->flags &= ~O_DIRECT; 1914 } 1915} 1916 1917/* 1918 * Open a regular file, set up an fd mapping, and fill out the struct 1919 * fuse_file_info for it. If existing_fd is not negative, use that fd instead 1920 * opening a new one. Takes ownership of existing_fd. 1921 * 1922 * Returns 0 on success or a positive errno. 1923 */ 1924static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, 1925 int existing_fd, struct fuse_file_info *fi) 1926{ 1927 ssize_t fh; 1928 int fd = existing_fd; 1929 int err; 1930 bool cap_fsetid_dropped = false; 1931 bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv; 1932 1933 update_open_flags(lo->writeback, lo->allow_direct_io, fi); 1934 1935 if (fd < 0) { 1936 if (kill_suidgid) { 1937 err = drop_effective_cap("FSETID", &cap_fsetid_dropped); 1938 if (err) { 1939 return err; 1940 } 1941 } 1942 1943 fd = lo_inode_open(lo, inode, fi->flags); 1944 1945 if (cap_fsetid_dropped) { 1946 if (gain_effective_cap("FSETID")) { 1947 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); 1948 } 1949 } 1950 if (fd < 0) { 1951 return -fd; 1952 } 1953 if (fi->flags & (O_TRUNC)) { 1954 int err = drop_security_capability(lo, fd); 1955 if (err) { 1956 close(fd); 1957 return err; 1958 } 1959 } 1960 } 1961 1962 pthread_mutex_lock(&lo->mutex); 1963 fh = lo_add_fd_mapping(lo, fd); 1964 pthread_mutex_unlock(&lo->mutex); 1965 if (fh == -1) { 1966 close(fd); 1967 return ENOMEM; 1968 } 1969 1970 fi->fh = fh; 1971 if (lo->cache == CACHE_NONE) { 1972 fi->direct_io = 1; 1973 } else if (lo->cache == 
CACHE_ALWAYS) { 1974 fi->keep_cache = 1; 1975 } 1976 return 0; 1977} 1978 1979static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, 1980 mode_t mode, struct fuse_file_info *fi) 1981{ 1982 int fd = -1; 1983 struct lo_data *lo = lo_data(req); 1984 struct lo_inode *parent_inode; 1985 struct lo_inode *inode = NULL; 1986 struct fuse_entry_param e; 1987 int err; 1988 struct lo_cred old = {}; 1989 1990 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)" 1991 " kill_priv=%d\n", parent, name, fi->kill_priv); 1992 1993 if (!is_safe_path_component(name)) { 1994 fuse_reply_err(req, EINVAL); 1995 return; 1996 } 1997 1998 parent_inode = lo_inode(req, parent); 1999 if (!parent_inode) { 2000 fuse_reply_err(req, EBADF); 2001 return; 2002 } 2003 2004 err = lo_change_cred(req, &old, lo->change_umask); 2005 if (err) { 2006 goto out; 2007 } 2008 2009 update_open_flags(lo->writeback, lo->allow_direct_io, fi); 2010 2011 /* Try to create a new file but don't open existing files */ 2012 fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode); 2013 err = fd == -1 ? 
errno : 0; 2014 2015 lo_restore_cred(&old, lo->change_umask); 2016 2017 /* Ignore the error if file exists and O_EXCL was not given */ 2018 if (err && (err != EEXIST || (fi->flags & O_EXCL))) { 2019 goto out; 2020 } 2021 2022 err = lo_do_lookup(req, parent, name, &e, &inode); 2023 if (err) { 2024 goto out; 2025 } 2026 2027 err = lo_do_open(lo, inode, fd, fi); 2028 fd = -1; /* lo_do_open() takes ownership of fd */ 2029 if (err) { 2030 /* Undo lo_do_lookup() nlookup ref */ 2031 unref_inode_lolocked(lo, inode, 1); 2032 } 2033 2034out: 2035 lo_inode_put(lo, &inode); 2036 lo_inode_put(lo, &parent_inode); 2037 2038 if (err) { 2039 if (fd >= 0) { 2040 close(fd); 2041 } 2042 2043 fuse_reply_err(req, err); 2044 } else { 2045 fuse_reply_create(req, &e, fi); 2046 } 2047} 2048 2049/* Should be called with inode->plock_mutex held */ 2050static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, 2051 struct lo_inode *inode, 2052 uint64_t lock_owner, 2053 pid_t pid, int *err) 2054{ 2055 struct lo_inode_plock *plock; 2056 int fd; 2057 2058 plock = 2059 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner)); 2060 2061 if (plock) { 2062 return plock; 2063 } 2064 2065 plock = malloc(sizeof(struct lo_inode_plock)); 2066 if (!plock) { 2067 *err = ENOMEM; 2068 return NULL; 2069 } 2070 2071 /* Open another instance of file which can be used for ofd locks. */ 2072 /* TODO: What if file is not writable? 
*/ 2073 fd = lo_inode_open(lo, inode, O_RDWR); 2074 if (fd < 0) { 2075 *err = -fd; 2076 free(plock); 2077 return NULL; 2078 } 2079 2080 plock->lock_owner = lock_owner; 2081 plock->fd = fd; 2082 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner), 2083 plock); 2084 return plock; 2085} 2086 2087static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, 2088 struct flock *lock) 2089{ 2090 struct lo_data *lo = lo_data(req); 2091 struct lo_inode *inode; 2092 struct lo_inode_plock *plock; 2093 int ret, saverr = 0; 2094 2095 fuse_log(FUSE_LOG_DEBUG, 2096 "lo_getlk(ino=%" PRIu64 ", flags=%d)" 2097 " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64 2098 " l_len=0x%" PRIx64 "\n", 2099 ino, fi->flags, fi->lock_owner, lock->l_type, 2100 (uint64_t)lock->l_start, (uint64_t)lock->l_len); 2101 2102 if (!lo->posix_lock) { 2103 fuse_reply_err(req, ENOSYS); 2104 return; 2105 } 2106 2107 inode = lo_inode(req, ino); 2108 if (!inode) { 2109 fuse_reply_err(req, EBADF); 2110 return; 2111 } 2112 2113 pthread_mutex_lock(&inode->plock_mutex); 2114 plock = 2115 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); 2116 if (!plock) { 2117 saverr = ret; 2118 goto out; 2119 } 2120 2121 ret = fcntl(plock->fd, F_OFD_GETLK, lock); 2122 if (ret == -1) { 2123 saverr = errno; 2124 } 2125 2126out: 2127 pthread_mutex_unlock(&inode->plock_mutex); 2128 lo_inode_put(lo, &inode); 2129 2130 if (saverr) { 2131 fuse_reply_err(req, saverr); 2132 } else { 2133 fuse_reply_lock(req, lock); 2134 } 2135} 2136 2137static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, 2138 struct flock *lock, int sleep) 2139{ 2140 struct lo_data *lo = lo_data(req); 2141 struct lo_inode *inode; 2142 struct lo_inode_plock *plock; 2143 int ret, saverr = 0; 2144 2145 fuse_log(FUSE_LOG_DEBUG, 2146 "lo_setlk(ino=%" PRIu64 ", flags=%d)" 2147 " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d" 2148 " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n", 
2149 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep, 2150 lock->l_whence, (uint64_t)lock->l_start, (uint64_t)lock->l_len); 2151 2152 if (!lo->posix_lock) { 2153 fuse_reply_err(req, ENOSYS); 2154 return; 2155 } 2156 2157 if (sleep) { 2158 fuse_reply_err(req, EOPNOTSUPP); 2159 return; 2160 } 2161 2162 inode = lo_inode(req, ino); 2163 if (!inode) { 2164 fuse_reply_err(req, EBADF); 2165 return; 2166 } 2167 2168 pthread_mutex_lock(&inode->plock_mutex); 2169 plock = 2170 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); 2171 2172 if (!plock) { 2173 saverr = ret; 2174 goto out; 2175 } 2176 2177 /* TODO: Is it alright to modify flock? */ 2178 lock->l_pid = 0; 2179 ret = fcntl(plock->fd, F_OFD_SETLK, lock); 2180 if (ret == -1) { 2181 saverr = errno; 2182 } 2183 2184out: 2185 pthread_mutex_unlock(&inode->plock_mutex); 2186 lo_inode_put(lo, &inode); 2187 2188 fuse_reply_err(req, saverr); 2189} 2190 2191static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, 2192 struct fuse_file_info *fi) 2193{ 2194 int res; 2195 struct lo_dirp *d; 2196 int fd; 2197 2198 (void)ino; 2199 2200 d = lo_dirp(req, fi); 2201 if (!d) { 2202 fuse_reply_err(req, EBADF); 2203 return; 2204 } 2205 2206 fd = dirfd(d->dp); 2207 if (datasync) { 2208 res = fdatasync(fd); 2209 } else { 2210 res = fsync(fd); 2211 } 2212 2213 lo_dirp_put(&d); 2214 2215 fuse_reply_err(req, res == -1 ? 
errno : 0); 2216} 2217 2218static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) 2219{ 2220 struct lo_data *lo = lo_data(req); 2221 struct lo_inode *inode = lo_inode(req, ino); 2222 int err; 2223 2224 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)" 2225 "\n", ino, fi->flags, fi->kill_priv); 2226 2227 if (!inode) { 2228 fuse_reply_err(req, EBADF); 2229 return; 2230 } 2231 2232 err = lo_do_open(lo, inode, -1, fi); 2233 lo_inode_put(lo, &inode); 2234 if (err) { 2235 fuse_reply_err(req, err); 2236 } else { 2237 fuse_reply_open(req, fi); 2238 } 2239} 2240 2241static void lo_release(fuse_req_t req, fuse_ino_t ino, 2242 struct fuse_file_info *fi) 2243{ 2244 struct lo_data *lo = lo_data(req); 2245 struct lo_map_elem *elem; 2246 int fd = -1; 2247 2248 (void)ino; 2249 2250 pthread_mutex_lock(&lo->mutex); 2251 elem = lo_map_get(&lo->fd_map, fi->fh); 2252 if (elem) { 2253 fd = elem->fd; 2254 elem = NULL; 2255 lo_map_remove(&lo->fd_map, fi->fh); 2256 } 2257 pthread_mutex_unlock(&lo->mutex); 2258 2259 close(fd); 2260 fuse_reply_err(req, 0); 2261} 2262 2263static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) 2264{ 2265 int res; 2266 (void)ino; 2267 struct lo_inode *inode; 2268 struct lo_data *lo = lo_data(req); 2269 2270 inode = lo_inode(req, ino); 2271 if (!inode) { 2272 fuse_reply_err(req, EBADF); 2273 return; 2274 } 2275 2276 if (!S_ISREG(inode->filetype)) { 2277 lo_inode_put(lo, &inode); 2278 fuse_reply_err(req, EBADF); 2279 return; 2280 } 2281 2282 /* An fd is going away. Cleanup associated posix locks */ 2283 if (lo->posix_lock) { 2284 pthread_mutex_lock(&inode->plock_mutex); 2285 g_hash_table_remove(inode->posix_locks, 2286 GUINT_TO_POINTER(fi->lock_owner)); 2287 pthread_mutex_unlock(&inode->plock_mutex); 2288 } 2289 res = close(dup(lo_fi_fd(req, fi))); 2290 lo_inode_put(lo, &inode); 2291 fuse_reply_err(req, res == -1 ? 
errno : 0); 2292} 2293 2294static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, 2295 struct fuse_file_info *fi) 2296{ 2297 struct lo_inode *inode = lo_inode(req, ino); 2298 struct lo_data *lo = lo_data(req); 2299 int res; 2300 int fd; 2301 2302 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, 2303 (void *)fi); 2304 2305 if (!inode) { 2306 fuse_reply_err(req, EBADF); 2307 return; 2308 } 2309 2310 if (!fi) { 2311 fd = lo_inode_open(lo, inode, O_RDWR); 2312 if (fd < 0) { 2313 res = -fd; 2314 goto out; 2315 } 2316 } else { 2317 fd = lo_fi_fd(req, fi); 2318 } 2319 2320 if (datasync) { 2321 res = fdatasync(fd) == -1 ? errno : 0; 2322 } else { 2323 res = fsync(fd) == -1 ? errno : 0; 2324 } 2325 if (!fi) { 2326 close(fd); 2327 } 2328out: 2329 lo_inode_put(lo, &inode); 2330 fuse_reply_err(req, res); 2331} 2332 2333static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, 2334 struct fuse_file_info *fi) 2335{ 2336 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); 2337 2338 fuse_log(FUSE_LOG_DEBUG, 2339 "lo_read(ino=%" PRIu64 ", size=%zd, " 2340 "off=%lu)\n", 2341 ino, size, (unsigned long)offset); 2342 2343 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; 2344 buf.buf[0].fd = lo_fi_fd(req, fi); 2345 buf.buf[0].pos = offset; 2346 2347 fuse_reply_data(req, &buf); 2348} 2349 2350static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, 2351 struct fuse_bufvec *in_buf, off_t off, 2352 struct fuse_file_info *fi) 2353{ 2354 (void)ino; 2355 ssize_t res; 2356 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); 2357 bool cap_fsetid_dropped = false; 2358 2359 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; 2360 out_buf.buf[0].fd = lo_fi_fd(req, fi); 2361 out_buf.buf[0].pos = off; 2362 2363 fuse_log(FUSE_LOG_DEBUG, 2364 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n", 2365 ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv); 2366 2367 res = 
drop_security_capability(lo_data(req), out_buf.buf[0].fd); 2368 if (res) { 2369 fuse_reply_err(req, res); 2370 return; 2371 } 2372 2373 /* 2374 * If kill_priv is set, drop CAP_FSETID which should lead to kernel 2375 * clearing setuid/setgid on file. Note, for WRITE, we need to do 2376 * this even if killpriv_v2 is not enabled. fuse direct write path 2377 * relies on this. 2378 */ 2379 if (fi->kill_priv) { 2380 res = drop_effective_cap("FSETID", &cap_fsetid_dropped); 2381 if (res != 0) { 2382 fuse_reply_err(req, res); 2383 return; 2384 } 2385 } 2386 2387 res = fuse_buf_copy(&out_buf, in_buf); 2388 if (res < 0) { 2389 fuse_reply_err(req, -res); 2390 } else { 2391 fuse_reply_write(req, (size_t)res); 2392 } 2393 2394 if (cap_fsetid_dropped) { 2395 res = gain_effective_cap("FSETID"); 2396 if (res) { 2397 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); 2398 } 2399 } 2400} 2401 2402static void lo_statfs(fuse_req_t req, fuse_ino_t ino) 2403{ 2404 int res; 2405 struct statvfs stbuf; 2406 2407 res = fstatvfs(lo_fd(req, ino), &stbuf); 2408 if (res == -1) { 2409 fuse_reply_err(req, errno); 2410 } else { 2411 fuse_reply_statfs(req, &stbuf); 2412 } 2413} 2414 2415static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, 2416 off_t length, struct fuse_file_info *fi) 2417{ 2418 int err = EOPNOTSUPP; 2419 (void)ino; 2420 2421#ifdef CONFIG_FALLOCATE 2422 err = fallocate(lo_fi_fd(req, fi), mode, offset, length); 2423 if (err < 0) { 2424 err = errno; 2425 } 2426 2427#elif defined(CONFIG_POSIX_FALLOCATE) 2428 if (mode) { 2429 fuse_reply_err(req, EOPNOTSUPP); 2430 return; 2431 } 2432 2433 err = posix_fallocate(lo_fi_fd(req, fi), offset, length); 2434#endif 2435 2436 fuse_reply_err(req, err); 2437} 2438 2439static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, 2440 int op) 2441{ 2442 int res; 2443 (void)ino; 2444 2445 res = flock(lo_fi_fd(req, fi), op); 2446 2447 fuse_reply_err(req, res == -1 ? 
errno : 0); 2448} 2449 2450/* types */ 2451/* 2452 * Exit; process attribute unmodified if matched. 2453 * An empty key applies to all. 2454 */ 2455#define XATTR_MAP_FLAG_OK (1 << 0) 2456/* 2457 * The attribute is unwanted; 2458 * EPERM on write, hidden on read. 2459 */ 2460#define XATTR_MAP_FLAG_BAD (1 << 1) 2461/* 2462 * For attr that start with 'key' prepend 'prepend' 2463 * 'key' may be empty to prepend for all attrs 2464 * key is defined from set/remove point of view. 2465 * Automatically reversed on read 2466 */ 2467#define XATTR_MAP_FLAG_PREFIX (1 << 2) 2468 2469/* scopes */ 2470/* Apply rule to get/set/remove */ 2471#define XATTR_MAP_FLAG_CLIENT (1 << 16) 2472/* Apply rule to list */ 2473#define XATTR_MAP_FLAG_SERVER (1 << 17) 2474/* Apply rule to all */ 2475#define XATTR_MAP_FLAG_ALL (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT) 2476 2477static void add_xattrmap_entry(struct lo_data *lo, 2478 const XattrMapEntry *new_entry) 2479{ 2480 XattrMapEntry *res = g_realloc_n(lo->xattr_map_list, 2481 lo->xattr_map_nentries + 1, 2482 sizeof(XattrMapEntry)); 2483 res[lo->xattr_map_nentries++] = *new_entry; 2484 2485 lo->xattr_map_list = res; 2486} 2487 2488static void free_xattrmap(struct lo_data *lo) 2489{ 2490 XattrMapEntry *map = lo->xattr_map_list; 2491 size_t i; 2492 2493 if (!map) { 2494 return; 2495 } 2496 2497 for (i = 0; i < lo->xattr_map_nentries; i++) { 2498 g_free(map[i].key); 2499 g_free(map[i].prepend); 2500 }; 2501 2502 g_free(map); 2503 lo->xattr_map_list = NULL; 2504 lo->xattr_map_nentries = -1; 2505} 2506 2507/* 2508 * Handle the 'map' type, which is sugar for a set of commands 2509 * for the common case of prefixing a subset or everything, 2510 * and allowing anything not prefixed through. 2511 * It must be the last entry in the stream, although there 2512 * can be other entries before it. 2513 * The form is: 2514 * :map:key:prefix: 2515 * 2516 * key maybe empty in which case all entries are prefixed. 
2517 */ 2518static void parse_xattrmap_map(struct lo_data *lo, 2519 const char *rule, char sep) 2520{ 2521 const char *tmp; 2522 char *key; 2523 char *prefix; 2524 XattrMapEntry tmp_entry; 2525 2526 if (*rule != sep) { 2527 fuse_log(FUSE_LOG_ERR, 2528 "%s: Expecting '%c' after 'map' keyword, found '%c'\n", 2529 __func__, sep, *rule); 2530 exit(1); 2531 } 2532 2533 rule++; 2534 2535 /* At start of 'key' field */ 2536 tmp = strchr(rule, sep); 2537 if (!tmp) { 2538 fuse_log(FUSE_LOG_ERR, 2539 "%s: Missing '%c' at end of key field in map rule\n", 2540 __func__, sep); 2541 exit(1); 2542 } 2543 2544 key = g_strndup(rule, tmp - rule); 2545 rule = tmp + 1; 2546 2547 /* At start of prefix field */ 2548 tmp = strchr(rule, sep); 2549 if (!tmp) { 2550 fuse_log(FUSE_LOG_ERR, 2551 "%s: Missing '%c' at end of prefix field in map rule\n", 2552 __func__, sep); 2553 exit(1); 2554 } 2555 2556 prefix = g_strndup(rule, tmp - rule); 2557 rule = tmp + 1; 2558 2559 /* 2560 * This should be the end of the string, we don't allow 2561 * any more commands after 'map'. 
2562 */ 2563 if (*rule) { 2564 fuse_log(FUSE_LOG_ERR, 2565 "%s: Expecting end of command after map, found '%c'\n", 2566 __func__, *rule); 2567 exit(1); 2568 } 2569 2570 /* 1st: Prefix matches/everything */ 2571 tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL; 2572 tmp_entry.key = g_strdup(key); 2573 tmp_entry.prepend = g_strdup(prefix); 2574 add_xattrmap_entry(lo, &tmp_entry); 2575 2576 if (!*key) { 2577 /* Prefix all case */ 2578 2579 /* 2nd: Hide any non-prefixed entries on the host */ 2580 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL; 2581 tmp_entry.key = g_strdup(""); 2582 tmp_entry.prepend = g_strdup(""); 2583 add_xattrmap_entry(lo, &tmp_entry); 2584 } else { 2585 /* Prefix matching case */ 2586 2587 /* 2nd: Hide non-prefixed but matching entries on the host */ 2588 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER; 2589 tmp_entry.key = g_strdup(""); /* Not used */ 2590 tmp_entry.prepend = g_strdup(key); 2591 add_xattrmap_entry(lo, &tmp_entry); 2592 2593 /* 3rd: Stop the client accessing prefixed attributes directly */ 2594 tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT; 2595 tmp_entry.key = g_strdup(prefix); 2596 tmp_entry.prepend = g_strdup(""); /* Not used */ 2597 add_xattrmap_entry(lo, &tmp_entry); 2598 2599 /* 4th: Everything else is OK */ 2600 tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL; 2601 tmp_entry.key = g_strdup(""); 2602 tmp_entry.prepend = g_strdup(""); 2603 add_xattrmap_entry(lo, &tmp_entry); 2604 } 2605 2606 g_free(key); 2607 g_free(prefix); 2608} 2609 2610static void parse_xattrmap(struct lo_data *lo) 2611{ 2612 const char *map = lo->xattrmap; 2613 const char *tmp; 2614 int ret; 2615 2616 lo->xattr_map_nentries = 0; 2617 while (*map) { 2618 XattrMapEntry tmp_entry; 2619 char sep; 2620 2621 if (isspace(*map)) { 2622 map++; 2623 continue; 2624 } 2625 /* The separator is the first non-space of the rule */ 2626 sep = *map++; 2627 if (!sep) { 2628 break; 2629 } 2630 2631 tmp_entry.flags 
= 0; 2632 /* Start of 'type' */ 2633 if (strstart(map, "prefix", &map)) { 2634 tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX; 2635 } else if (strstart(map, "ok", &map)) { 2636 tmp_entry.flags |= XATTR_MAP_FLAG_OK; 2637 } else if (strstart(map, "bad", &map)) { 2638 tmp_entry.flags |= XATTR_MAP_FLAG_BAD; 2639 } else if (strstart(map, "map", &map)) { 2640 /* 2641 * map is sugar that adds a number of rules, and must be 2642 * the last entry. 2643 */ 2644 parse_xattrmap_map(lo, map, sep); 2645 break; 2646 } else { 2647 fuse_log(FUSE_LOG_ERR, 2648 "%s: Unexpected type;" 2649 "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n", 2650 __func__, lo->xattr_map_nentries); 2651 exit(1); 2652 } 2653 2654 if (*map++ != sep) { 2655 fuse_log(FUSE_LOG_ERR, 2656 "%s: Missing '%c' at end of type field of rule %zu\n", 2657 __func__, sep, lo->xattr_map_nentries); 2658 exit(1); 2659 } 2660 2661 /* Start of 'scope' */ 2662 if (strstart(map, "client", &map)) { 2663 tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT; 2664 } else if (strstart(map, "server", &map)) { 2665 tmp_entry.flags |= XATTR_MAP_FLAG_SERVER; 2666 } else if (strstart(map, "all", &map)) { 2667 tmp_entry.flags |= XATTR_MAP_FLAG_ALL; 2668 } else { 2669 fuse_log(FUSE_LOG_ERR, 2670 "%s: Unexpected scope;" 2671 " Expecting 'client', 'server', or 'all', in rule %zu\n", 2672 __func__, lo->xattr_map_nentries); 2673 exit(1); 2674 } 2675 2676 if (*map++ != sep) { 2677 fuse_log(FUSE_LOG_ERR, 2678 "%s: Expecting '%c' found '%c'" 2679 " after scope in rule %zu\n", 2680 __func__, sep, *map, lo->xattr_map_nentries); 2681 exit(1); 2682 } 2683 2684 /* At start of 'key' field */ 2685 tmp = strchr(map, sep); 2686 if (!tmp) { 2687 fuse_log(FUSE_LOG_ERR, 2688 "%s: Missing '%c' at end of key field of rule %zu", 2689 __func__, sep, lo->xattr_map_nentries); 2690 exit(1); 2691 } 2692 tmp_entry.key = g_strndup(map, tmp - map); 2693 map = tmp + 1; 2694 2695 /* At start of 'prepend' field */ 2696 tmp = strchr(map, sep); 2697 if (!tmp) { 2698 
fuse_log(FUSE_LOG_ERR, 2699 "%s: Missing '%c' at end of prepend field of rule %zu", 2700 __func__, sep, lo->xattr_map_nentries); 2701 exit(1); 2702 } 2703 tmp_entry.prepend = g_strndup(map, tmp - map); 2704 map = tmp + 1; 2705 2706 add_xattrmap_entry(lo, &tmp_entry); 2707 /* End of rule - go around again for another rule */ 2708 } 2709 2710 if (!lo->xattr_map_nentries) { 2711 fuse_log(FUSE_LOG_ERR, "Empty xattr map\n"); 2712 exit(1); 2713 } 2714 2715 ret = xattr_map_client(lo, "security.capability", 2716 &lo->xattr_security_capability); 2717 if (ret) { 2718 fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n", 2719 strerror(ret)); 2720 exit(1); 2721 } 2722 if (!lo->xattr_security_capability || 2723 !strcmp(lo->xattr_security_capability, "security.capability")) { 2724 /* 1-1 mapping, don't need to do anything */ 2725 free(lo->xattr_security_capability); 2726 lo->xattr_security_capability = NULL; 2727 } 2728} 2729 2730/* 2731 * For use with getxattr/setxattr/removexattr, where the client 2732 * gives us a name and we may need to choose a different one. 2733 * Allocates a buffer for the result placing it in *out_name. 2734 * If there's no change then *out_name is not set. 2735 * Returns 0 on success 2736 * Can return -EPERM to indicate we block a given attribute 2737 * (in which case out_name is not allocated) 2738 * Can return -ENOMEM to indicate out_name couldn't be allocated. 
2739 */ 2740static int xattr_map_client(const struct lo_data *lo, const char *client_name, 2741 char **out_name) 2742{ 2743 size_t i; 2744 for (i = 0; i < lo->xattr_map_nentries; i++) { 2745 const XattrMapEntry *cur_entry = lo->xattr_map_list + i; 2746 2747 if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) && 2748 (strstart(client_name, cur_entry->key, NULL))) { 2749 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) { 2750 return -EPERM; 2751 } 2752 if (cur_entry->flags & XATTR_MAP_FLAG_OK) { 2753 /* Unmodified name */ 2754 return 0; 2755 } 2756 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) { 2757 *out_name = g_try_malloc(strlen(client_name) + 2758 strlen(cur_entry->prepend) + 1); 2759 if (!*out_name) { 2760 return -ENOMEM; 2761 } 2762 sprintf(*out_name, "%s%s", cur_entry->prepend, client_name); 2763 return 0; 2764 } 2765 } 2766 } 2767 2768 return -EPERM; 2769} 2770 2771/* 2772 * For use with listxattr where the server fs gives us a name and we may need 2773 * to sanitize this for the client. 2774 * Returns a pointer to the result in *out_name 2775 * This is always the original string or the current string with some prefix 2776 * removed; no reallocation is done. 2777 * Returns 0 on success 2778 * Can return -ENODATA to indicate the name should be dropped from the list. 
2779 */ 2780static int xattr_map_server(const struct lo_data *lo, const char *server_name, 2781 const char **out_name) 2782{ 2783 size_t i; 2784 const char *end; 2785 2786 for (i = 0; i < lo->xattr_map_nentries; i++) { 2787 const XattrMapEntry *cur_entry = lo->xattr_map_list + i; 2788 2789 if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) && 2790 (strstart(server_name, cur_entry->prepend, &end))) { 2791 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) { 2792 return -ENODATA; 2793 } 2794 if (cur_entry->flags & XATTR_MAP_FLAG_OK) { 2795 *out_name = server_name; 2796 return 0; 2797 } 2798 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) { 2799 /* Remove prefix */ 2800 *out_name = end; 2801 return 0; 2802 } 2803 } 2804 } 2805 2806 return -ENODATA; 2807} 2808 2809#define FCHDIR_NOFAIL(fd) do { \ 2810 int fchdir_res = fchdir(fd); \ 2811 assert(fchdir_res == 0); \ 2812 } while (0) 2813 2814static bool block_xattr(struct lo_data *lo, const char *name) 2815{ 2816 /* 2817 * If user explicitly enabled posix_acl or did not provide any option, 2818 * do not block acl. Otherwise block system.posix_acl_access and 2819 * system.posix_acl_default xattrs. 2820 */ 2821 if (lo->user_posix_acl) { 2822 return false; 2823 } 2824 if (!strcmp(name, "system.posix_acl_access") || 2825 !strcmp(name, "system.posix_acl_default")) 2826 return true; 2827 2828 return false; 2829} 2830 2831/* 2832 * Returns number of bytes in xattr_list after filtering on success. This 2833 * could be zero as well if nothing is left after filtering. 2834 * 2835 * Returns negative error code on failure. 2836 * xattr_list is modified in place. 2837 */ 2838static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list, 2839 unsigned in_size) 2840{ 2841 size_t out_index, in_index; 2842 2843 /* 2844 * As of now we only filter out acl xattrs. If acls are enabled or 2845 * they have not been explicitly disabled, there is nothing to 2846 * filter. 
2847 */ 2848 if (lo->user_posix_acl) { 2849 return in_size; 2850 } 2851 2852 out_index = 0; 2853 in_index = 0; 2854 while (in_index < in_size) { 2855 char *in_ptr = xattr_list + in_index; 2856 2857 /* Length of current attribute name */ 2858 size_t in_len = strlen(xattr_list + in_index) + 1; 2859 2860 if (!block_xattr(lo, in_ptr)) { 2861 if (in_index != out_index) { 2862 memmove(xattr_list + out_index, xattr_list + in_index, in_len); 2863 } 2864 out_index += in_len; 2865 } 2866 in_index += in_len; 2867 } 2868 return out_index; 2869} 2870 2871static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, 2872 size_t size) 2873{ 2874 struct lo_data *lo = lo_data(req); 2875 g_autofree char *value = NULL; 2876 char procname[64]; 2877 const char *name; 2878 char *mapped_name; 2879 struct lo_inode *inode; 2880 ssize_t ret; 2881 int saverr; 2882 int fd = -1; 2883 2884 if (block_xattr(lo, in_name)) { 2885 fuse_reply_err(req, EOPNOTSUPP); 2886 return; 2887 } 2888 2889 mapped_name = NULL; 2890 name = in_name; 2891 if (lo->xattrmap) { 2892 ret = xattr_map_client(lo, in_name, &mapped_name); 2893 if (ret < 0) { 2894 if (ret == -EPERM) { 2895 ret = -ENODATA; 2896 } 2897 fuse_reply_err(req, -ret); 2898 return; 2899 } 2900 if (mapped_name) { 2901 name = mapped_name; 2902 } 2903 } 2904 2905 inode = lo_inode(req, ino); 2906 if (!inode) { 2907 fuse_reply_err(req, EBADF); 2908 g_free(mapped_name); 2909 return; 2910 } 2911 2912 saverr = ENOSYS; 2913 if (!lo_data(req)->xattr) { 2914 goto out; 2915 } 2916 2917 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", 2918 ino, name, size); 2919 2920 if (size) { 2921 value = g_try_malloc(size); 2922 if (!value) { 2923 goto out_err; 2924 } 2925 } 2926 2927 sprintf(procname, "%i", inode->fd); 2928 /* 2929 * It is not safe to open() non-regular/non-dir files in file server 2930 * unless O_PATH is used, so use that method for regular files/dir 2931 * only (as it seems giving less performance overhead). 
2932 * Otherwise, call fchdir() to avoid open(). 2933 */ 2934 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 2935 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 2936 if (fd < 0) { 2937 goto out_err; 2938 } 2939 ret = fgetxattr(fd, name, value, size); 2940 saverr = ret == -1 ? errno : 0; 2941 } else { 2942 /* fchdir should not fail here */ 2943 FCHDIR_NOFAIL(lo->proc_self_fd); 2944 ret = getxattr(procname, name, value, size); 2945 saverr = ret == -1 ? errno : 0; 2946 FCHDIR_NOFAIL(lo->root.fd); 2947 } 2948 2949 if (ret == -1) { 2950 goto out; 2951 } 2952 if (size) { 2953 saverr = 0; 2954 if (ret == 0) { 2955 goto out; 2956 } 2957 fuse_reply_buf(req, value, ret); 2958 } else { 2959 fuse_reply_xattr(req, ret); 2960 } 2961out_free: 2962 if (fd >= 0) { 2963 close(fd); 2964 } 2965 2966 lo_inode_put(lo, &inode); 2967 return; 2968 2969out_err: 2970 saverr = errno; 2971out: 2972 fuse_reply_err(req, saverr); 2973 g_free(mapped_name); 2974 goto out_free; 2975} 2976 2977static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) 2978{ 2979 struct lo_data *lo = lo_data(req); 2980 g_autofree char *value = NULL; 2981 char procname[64]; 2982 struct lo_inode *inode; 2983 ssize_t ret; 2984 int saverr; 2985 int fd = -1; 2986 2987 inode = lo_inode(req, ino); 2988 if (!inode) { 2989 fuse_reply_err(req, EBADF); 2990 return; 2991 } 2992 2993 saverr = ENOSYS; 2994 if (!lo_data(req)->xattr) { 2995 goto out; 2996 } 2997 2998 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, 2999 size); 3000 3001 if (size) { 3002 value = g_try_malloc(size); 3003 if (!value) { 3004 goto out_err; 3005 } 3006 } 3007 3008 sprintf(procname, "%i", inode->fd); 3009 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 3010 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 3011 if (fd < 0) { 3012 goto out_err; 3013 } 3014 ret = flistxattr(fd, value, size); 3015 saverr = ret == -1 ? 
errno : 0; 3016 } else { 3017 /* fchdir should not fail here */ 3018 FCHDIR_NOFAIL(lo->proc_self_fd); 3019 ret = listxattr(procname, value, size); 3020 saverr = ret == -1 ? errno : 0; 3021 FCHDIR_NOFAIL(lo->root.fd); 3022 } 3023 3024 if (ret == -1) { 3025 goto out; 3026 } 3027 if (size) { 3028 saverr = 0; 3029 if (ret == 0) { 3030 goto out; 3031 } 3032 3033 if (lo->xattr_map_list) { 3034 /* 3035 * Map the names back, some attributes might be dropped, 3036 * some shortened, but not increased, so we shouldn't 3037 * run out of room. 3038 */ 3039 size_t out_index, in_index; 3040 out_index = 0; 3041 in_index = 0; 3042 while (in_index < ret) { 3043 const char *map_out; 3044 char *in_ptr = value + in_index; 3045 /* Length of current attribute name */ 3046 size_t in_len = strlen(value + in_index) + 1; 3047 3048 int mapret = xattr_map_server(lo, in_ptr, &map_out); 3049 if (mapret != -ENODATA && mapret != 0) { 3050 /* Shouldn't happen */ 3051 saverr = -mapret; 3052 goto out; 3053 } 3054 if (mapret == 0) { 3055 /* Either unchanged, or truncated */ 3056 size_t out_len; 3057 if (map_out != in_ptr) { 3058 /* +1 copies the NIL */ 3059 out_len = strlen(map_out) + 1; 3060 } else { 3061 /* No change */ 3062 out_len = in_len; 3063 } 3064 /* 3065 * Move result along, may still be needed for an unchanged 3066 * entry if a previous entry was changed. 3067 */ 3068 memmove(value + out_index, map_out, out_len); 3069 3070 out_index += out_len; 3071 } 3072 in_index += in_len; 3073 } 3074 ret = out_index; 3075 if (ret == 0) { 3076 goto out; 3077 } 3078 } 3079 3080 ret = remove_blocked_xattrs(lo, value, ret); 3081 if (ret <= 0) { 3082 saverr = -ret; 3083 goto out; 3084 } 3085 fuse_reply_buf(req, value, ret); 3086 } else { 3087 /* 3088 * xattrmap only ever shortens the result, 3089 * so we don't need to do anything clever with the 3090 * allocation length here. 
3091 */ 3092 fuse_reply_xattr(req, ret); 3093 } 3094out_free: 3095 if (fd >= 0) { 3096 close(fd); 3097 } 3098 3099 lo_inode_put(lo, &inode); 3100 return; 3101 3102out_err: 3103 saverr = errno; 3104out: 3105 fuse_reply_err(req, saverr); 3106 goto out_free; 3107} 3108 3109static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, 3110 const char *value, size_t size, int flags, 3111 uint32_t extra_flags) 3112{ 3113 char procname[64]; 3114 const char *name; 3115 char *mapped_name; 3116 struct lo_data *lo = lo_data(req); 3117 struct lo_inode *inode; 3118 ssize_t ret; 3119 int saverr; 3120 int fd = -1; 3121 bool switched_creds = false; 3122 bool cap_fsetid_dropped = false; 3123 struct lo_cred old = {}; 3124 3125 if (block_xattr(lo, in_name)) { 3126 fuse_reply_err(req, EOPNOTSUPP); 3127 return; 3128 } 3129 3130 mapped_name = NULL; 3131 name = in_name; 3132 if (lo->xattrmap) { 3133 ret = xattr_map_client(lo, in_name, &mapped_name); 3134 if (ret < 0) { 3135 fuse_reply_err(req, -ret); 3136 return; 3137 } 3138 if (mapped_name) { 3139 name = mapped_name; 3140 } 3141 } 3142 3143 inode = lo_inode(req, ino); 3144 if (!inode) { 3145 fuse_reply_err(req, EBADF); 3146 g_free(mapped_name); 3147 return; 3148 } 3149 3150 saverr = ENOSYS; 3151 if (!lo_data(req)->xattr) { 3152 goto out; 3153 } 3154 3155 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 3156 ", name=%s value=%s size=%zd)\n", ino, name, value, size); 3157 3158 sprintf(procname, "%i", inode->fd); 3159 /* 3160 * If we are setting posix access acl and if SGID needs to be 3161 * cleared, then switch to caller's gid and drop CAP_FSETID 3162 * and that should make sure host kernel clears SGID. 3163 * 3164 * This probably will not work when we support idmapped mounts. 3165 * In that case we will need to find a non-root gid and switch 3166 * to it. (Instead of gid in request). Fix it when we support 3167 * idmapped mounts. 
3168 */ 3169 if (lo->posix_acl && !strcmp(name, "system.posix_acl_access") 3170 && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) { 3171 ret = lo_drop_cap_change_cred(req, &old, false, "FSETID", 3172 &cap_fsetid_dropped); 3173 if (ret) { 3174 saverr = ret; 3175 goto out; 3176 } 3177 switched_creds = true; 3178 } 3179 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 3180 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 3181 if (fd < 0) { 3182 saverr = errno; 3183 goto out; 3184 } 3185 ret = fsetxattr(fd, name, value, size, flags); 3186 saverr = ret == -1 ? errno : 0; 3187 } else { 3188 /* fchdir should not fail here */ 3189 FCHDIR_NOFAIL(lo->proc_self_fd); 3190 ret = setxattr(procname, name, value, size, flags); 3191 saverr = ret == -1 ? errno : 0; 3192 FCHDIR_NOFAIL(lo->root.fd); 3193 } 3194 if (switched_creds) { 3195 if (cap_fsetid_dropped) 3196 lo_restore_cred_gain_cap(&old, false, "FSETID"); 3197 else 3198 lo_restore_cred(&old, false); 3199 } 3200 3201out: 3202 if (fd >= 0) { 3203 close(fd); 3204 } 3205 3206 lo_inode_put(lo, &inode); 3207 g_free(mapped_name); 3208 fuse_reply_err(req, saverr); 3209} 3210 3211static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name) 3212{ 3213 char procname[64]; 3214 const char *name; 3215 char *mapped_name; 3216 struct lo_data *lo = lo_data(req); 3217 struct lo_inode *inode; 3218 ssize_t ret; 3219 int saverr; 3220 int fd = -1; 3221 3222 if (block_xattr(lo, in_name)) { 3223 fuse_reply_err(req, EOPNOTSUPP); 3224 return; 3225 } 3226 3227 mapped_name = NULL; 3228 name = in_name; 3229 if (lo->xattrmap) { 3230 ret = xattr_map_client(lo, in_name, &mapped_name); 3231 if (ret < 0) { 3232 fuse_reply_err(req, -ret); 3233 return; 3234 } 3235 if (mapped_name) { 3236 name = mapped_name; 3237 } 3238 } 3239 3240 inode = lo_inode(req, ino); 3241 if (!inode) { 3242 fuse_reply_err(req, EBADF); 3243 g_free(mapped_name); 3244 return; 3245 } 3246 3247 saverr = ENOSYS; 3248 if (!lo_data(req)->xattr) { 3249 goto out; 
3250 } 3251 3252 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, 3253 name); 3254 3255 sprintf(procname, "%i", inode->fd); 3256 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { 3257 fd = openat(lo->proc_self_fd, procname, O_RDONLY); 3258 if (fd < 0) { 3259 saverr = errno; 3260 goto out; 3261 } 3262 ret = fremovexattr(fd, name); 3263 saverr = ret == -1 ? errno : 0; 3264 } else { 3265 /* fchdir should not fail here */ 3266 FCHDIR_NOFAIL(lo->proc_self_fd); 3267 ret = removexattr(procname, name); 3268 saverr = ret == -1 ? errno : 0; 3269 FCHDIR_NOFAIL(lo->root.fd); 3270 } 3271 3272out: 3273 if (fd >= 0) { 3274 close(fd); 3275 } 3276 3277 lo_inode_put(lo, &inode); 3278 g_free(mapped_name); 3279 fuse_reply_err(req, saverr); 3280} 3281 3282#ifdef HAVE_COPY_FILE_RANGE 3283static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, 3284 struct fuse_file_info *fi_in, fuse_ino_t ino_out, 3285 off_t off_out, struct fuse_file_info *fi_out, 3286 size_t len, int flags) 3287{ 3288 int in_fd, out_fd; 3289 ssize_t res; 3290 3291 in_fd = lo_fi_fd(req, fi_in); 3292 out_fd = lo_fi_fd(req, fi_out); 3293 3294 fuse_log(FUSE_LOG_DEBUG, 3295 "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, " 3296 "off=%ju, ino=%" PRIu64 "/fd=%d, " 3297 "off=%ju, size=%zd, flags=0x%x)\n", 3298 ino_in, in_fd, (intmax_t)off_in, 3299 ino_out, out_fd, (intmax_t)off_out, len, flags); 3300 3301 res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); 3302 if (res < 0) { 3303 fuse_reply_err(req, errno); 3304 } else { 3305 fuse_reply_write(req, res); 3306 } 3307} 3308#endif 3309 3310static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, 3311 struct fuse_file_info *fi) 3312{ 3313 off_t res; 3314 3315 (void)ino; 3316 res = lseek(lo_fi_fd(req, fi), off, whence); 3317 if (res != -1) { 3318 fuse_reply_lseek(req, res); 3319 } else { 3320 fuse_reply_err(req, errno); 3321 } 3322} 3323 3324static void lo_destroy(void *userdata) 3325{ 3326 
struct lo_data *lo = (struct lo_data *)userdata; 3327 3328 pthread_mutex_lock(&lo->mutex); 3329 while (true) { 3330 GHashTableIter iter; 3331 gpointer key, value; 3332 3333 g_hash_table_iter_init(&iter, lo->inodes); 3334 if (!g_hash_table_iter_next(&iter, &key, &value)) { 3335 break; 3336 } 3337 3338 struct lo_inode *inode = value; 3339 unref_inode(lo, inode, inode->nlookup); 3340 } 3341 pthread_mutex_unlock(&lo->mutex); 3342} 3343 3344static struct fuse_lowlevel_ops lo_oper = { 3345 .init = lo_init, 3346 .lookup = lo_lookup, 3347 .mkdir = lo_mkdir, 3348 .mknod = lo_mknod, 3349 .symlink = lo_symlink, 3350 .link = lo_link, 3351 .unlink = lo_unlink, 3352 .rmdir = lo_rmdir, 3353 .rename = lo_rename, 3354 .forget = lo_forget, 3355 .forget_multi = lo_forget_multi, 3356 .getattr = lo_getattr, 3357 .setattr = lo_setattr, 3358 .readlink = lo_readlink, 3359 .opendir = lo_opendir, 3360 .readdir = lo_readdir, 3361 .readdirplus = lo_readdirplus, 3362 .releasedir = lo_releasedir, 3363 .fsyncdir = lo_fsyncdir, 3364 .create = lo_create, 3365 .getlk = lo_getlk, 3366 .setlk = lo_setlk, 3367 .open = lo_open, 3368 .release = lo_release, 3369 .flush = lo_flush, 3370 .fsync = lo_fsync, 3371 .read = lo_read, 3372 .write_buf = lo_write_buf, 3373 .statfs = lo_statfs, 3374 .fallocate = lo_fallocate, 3375 .flock = lo_flock, 3376 .getxattr = lo_getxattr, 3377 .listxattr = lo_listxattr, 3378 .setxattr = lo_setxattr, 3379 .removexattr = lo_removexattr, 3380#ifdef HAVE_COPY_FILE_RANGE 3381 .copy_file_range = lo_copy_file_range, 3382#endif 3383 .lseek = lo_lseek, 3384 .destroy = lo_destroy, 3385}; 3386 3387/* Print vhost-user.json backend program capabilities */ 3388static void print_capabilities(void) 3389{ 3390 printf("{\n"); 3391 printf(" \"type\": \"fs\"\n"); 3392 printf("}\n"); 3393} 3394 3395/* 3396 * Drop all Linux capabilities because the wait parent process only needs to 3397 * sit in waitpid(2) and terminate. 
3398 */ 3399static void setup_wait_parent_capabilities(void) 3400{ 3401 capng_setpid(syscall(SYS_gettid)); 3402 capng_clear(CAPNG_SELECT_BOTH); 3403 capng_apply(CAPNG_SELECT_BOTH); 3404} 3405 3406/* 3407 * Move to a new mount, net, and pid namespaces to isolate this process. 3408 */ 3409static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) 3410{ 3411 pid_t child; 3412 3413 /* 3414 * Create a new pid namespace for *child* processes. We'll have to 3415 * fork in order to enter the new pid namespace. A new mount namespace 3416 * is also needed so that we can remount /proc for the new pid 3417 * namespace. 3418 * 3419 * Our UNIX domain sockets have been created. Now we can move to 3420 * an empty network namespace to prevent TCP/IP and other network 3421 * activity in case this process is compromised. 3422 */ 3423 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) { 3424 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n"); 3425 exit(1); 3426 } 3427 3428 child = fork(); 3429 if (child < 0) { 3430 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n"); 3431 exit(1); 3432 } 3433 if (child > 0) { 3434 pid_t waited; 3435 int wstatus; 3436 3437 setup_wait_parent_capabilities(); 3438 3439 /* The parent waits for the child */ 3440 do { 3441 waited = waitpid(child, &wstatus, 0); 3442 } while (waited < 0 && errno == EINTR && !se->exited); 3443 3444 /* We were terminated by a signal, see fuse_signals.c */ 3445 if (se->exited) { 3446 exit(0); 3447 } 3448 3449 if (WIFEXITED(wstatus)) { 3450 exit(WEXITSTATUS(wstatus)); 3451 } 3452 3453 exit(1); 3454 } 3455 3456 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */ 3457 prctl(PR_SET_PDEATHSIG, SIGTERM); 3458 3459 /* 3460 * If the mounts have shared propagation then we want to opt out so our 3461 * mount changes don't affect the parent mount namespace. 
3462 */ 3463 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { 3464 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n"); 3465 exit(1); 3466 } 3467 3468 /* The child must remount /proc to use the new pid namespace */ 3469 if (mount("proc", "/proc", "proc", 3470 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { 3471 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n"); 3472 exit(1); 3473 } 3474 3475 /* 3476 * We only need /proc/self/fd. Prevent ".." from accessing parent 3477 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was 3478 * previously remounted with MS_REC | MS_SLAVE this mount change only 3479 * affects our process. 3480 */ 3481 if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) { 3482 fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n"); 3483 exit(1); 3484 } 3485 3486 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */ 3487 lo->proc_self_fd = open("/proc", O_PATH); 3488 if (lo->proc_self_fd == -1) { 3489 fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n"); 3490 exit(1); 3491 } 3492} 3493 3494/* 3495 * Capture the capability state, we'll need to restore this for individual 3496 * threads later; see load_capng. 3497 */ 3498static void setup_capng(void) 3499{ 3500 /* Note this accesses /proc so has to happen before the sandbox */ 3501 if (capng_get_caps_process()) { 3502 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n"); 3503 exit(1); 3504 } 3505 pthread_mutex_init(&cap.mutex, NULL); 3506 pthread_mutex_lock(&cap.mutex); 3507 cap.saved = capng_save_state(); 3508 if (!cap.saved) { 3509 fuse_log(FUSE_LOG_ERR, "capng_save_state\n"); 3510 exit(1); 3511 } 3512 pthread_mutex_unlock(&cap.mutex); 3513} 3514 3515static void cleanup_capng(void) 3516{ 3517 free(cap.saved); 3518 cap.saved = NULL; 3519 pthread_mutex_destroy(&cap.mutex); 3520} 3521 3522 3523/* 3524 * Make the source directory our root so symlinks cannot escape and no other 3525 * files are accessible. 
Assumes unshare(CLONE_NEWNS) was already called. 3526 */ 3527static void setup_mounts(const char *source) 3528{ 3529 int oldroot; 3530 int newroot; 3531 3532 if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) { 3533 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); 3534 exit(1); 3535 } 3536 3537 /* This magic is based on lxc's lxc_pivot_root() */ 3538 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); 3539 if (oldroot < 0) { 3540 fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); 3541 exit(1); 3542 } 3543 3544 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC); 3545 if (newroot < 0) { 3546 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source); 3547 exit(1); 3548 } 3549 3550 if (fchdir(newroot) < 0) { 3551 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); 3552 exit(1); 3553 } 3554 3555 if (syscall(__NR_pivot_root, ".", ".") < 0) { 3556 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n"); 3557 exit(1); 3558 } 3559 3560 if (fchdir(oldroot) < 0) { 3561 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n"); 3562 exit(1); 3563 } 3564 3565 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) { 3566 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n"); 3567 exit(1); 3568 } 3569 3570 if (umount2(".", MNT_DETACH) < 0) { 3571 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n"); 3572 exit(1); 3573 } 3574 3575 if (fchdir(newroot) < 0) { 3576 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); 3577 exit(1); 3578 } 3579 3580 close(newroot); 3581 close(oldroot); 3582} 3583 3584/* 3585 * Only keep capabilities in allowlist that are needed for file system operation 3586 * The (possibly NULL) modcaps_in string passed in is free'd before exit. 3587 */ 3588static void setup_capabilities(char *modcaps_in) 3589{ 3590 char *modcaps = modcaps_in; 3591 pthread_mutex_lock(&cap.mutex); 3592 capng_restore_state(&cap.saved); 3593 3594 /* 3595 * Add to allowlist file system-related capabilities that are needed for a 3596 * file server to act like root. 
Drop everything else like networking and 3597 * sysadmin capabilities. 3598 * 3599 * Exclusions: 3600 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl 3601 * and we don't support that. 3602 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be 3603 * used by the Smack LSM. Omit it until there is demand for it. 3604 */ 3605 capng_setpid(syscall(SYS_gettid)); 3606 capng_clear(CAPNG_SELECT_BOTH); 3607 if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, 3608 CAP_CHOWN, 3609 CAP_DAC_OVERRIDE, 3610 CAP_FOWNER, 3611 CAP_FSETID, 3612 CAP_SETGID, 3613 CAP_SETUID, 3614 CAP_MKNOD, 3615 CAP_SETFCAP, 3616 -1)) { 3617 fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__); 3618 exit(1); 3619 } 3620 3621 /* 3622 * The modcaps option is a colon separated list of caps, 3623 * each preceded by either + or -. 3624 */ 3625 while (modcaps) { 3626 capng_act_t action; 3627 int cap; 3628 3629 char *next = strchr(modcaps, ':'); 3630 if (next) { 3631 *next = '\0'; 3632 next++; 3633 } 3634 3635 switch (modcaps[0]) { 3636 case '+': 3637 action = CAPNG_ADD; 3638 break; 3639 3640 case '-': 3641 action = CAPNG_DROP; 3642 break; 3643 3644 default: 3645 fuse_log(FUSE_LOG_ERR, 3646 "%s: Expecting '+'/'-' in modcaps but found '%c'\n", 3647 __func__, modcaps[0]); 3648 exit(1); 3649 } 3650 cap = capng_name_to_capability(modcaps + 1); 3651 if (cap < 0) { 3652 fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__, 3653 modcaps); 3654 exit(1); 3655 } 3656 if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) { 3657 fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n", 3658 __func__, modcaps); 3659 exit(1); 3660 } 3661 3662 modcaps = next; 3663 } 3664 g_free(modcaps_in); 3665 3666 if (capng_apply(CAPNG_SELECT_BOTH)) { 3667 fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__); 3668 exit(1); 3669 } 3670 3671 cap.saved = capng_save_state(); 3672 if (!cap.saved) { 3673 fuse_log(FUSE_LOG_ERR, "%s: capng_save_state 
failed\n", __func__); 3674 exit(1); 3675 } 3676 pthread_mutex_unlock(&cap.mutex); 3677} 3678 3679/* 3680 * Use chroot as a weaker sandbox for environments where the process is 3681 * launched without CAP_SYS_ADMIN. 3682 */ 3683static void setup_chroot(struct lo_data *lo) 3684{ 3685 lo->proc_self_fd = open("/proc/self/fd", O_PATH); 3686 if (lo->proc_self_fd == -1) { 3687 fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n"); 3688 exit(1); 3689 } 3690 3691 /* 3692 * Make the shared directory the file system root so that FUSE_OPEN 3693 * (lo_open()) cannot escape the shared directory by opening a symlink. 3694 * 3695 * The chroot(2) syscall is later disabled by seccomp and the 3696 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot 3697 * is not possible. 3698 * 3699 * However, it's still possible to escape the chroot via lo->proc_self_fd 3700 * but that requires first gaining control of the process. 3701 */ 3702 if (chroot(lo->source) != 0) { 3703 fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source); 3704 exit(1); 3705 } 3706 3707 /* Move into the chroot */ 3708 if (chdir("/") != 0) { 3709 fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n"); 3710 exit(1); 3711 } 3712} 3713 3714/* 3715 * Lock down this process to prevent access to other processes or files outside 3716 * source directory. This reduces the impact of arbitrary code execution bugs. 
3717 */ 3718static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, 3719 bool enable_syslog) 3720{ 3721 if (lo->sandbox == SANDBOX_NAMESPACE) { 3722 setup_namespaces(lo, se); 3723 setup_mounts(lo->source); 3724 } else { 3725 setup_chroot(lo); 3726 } 3727 3728 setup_seccomp(enable_syslog); 3729 setup_capabilities(g_strdup(lo->modcaps)); 3730} 3731 3732/* Set the maximum number of open file descriptors */ 3733static void setup_nofile_rlimit(unsigned long rlimit_nofile) 3734{ 3735 struct rlimit rlim = { 3736 .rlim_cur = rlimit_nofile, 3737 .rlim_max = rlimit_nofile, 3738 }; 3739 3740 if (rlimit_nofile == 0) { 3741 return; /* nothing to do */ 3742 } 3743 3744 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { 3745 /* Ignore SELinux denials */ 3746 if (errno == EPERM) { 3747 return; 3748 } 3749 3750 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n"); 3751 exit(1); 3752 } 3753} 3754 3755static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) 3756{ 3757 g_autofree char *localfmt = NULL; 3758 3759 if (current_log_level < level) { 3760 return; 3761 } 3762 3763 if (current_log_level == FUSE_LOG_DEBUG) { 3764 if (use_syslog) { 3765 /* no timestamp needed */ 3766 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), 3767 fmt); 3768 } else { 3769 g_autoptr(GDateTime) now = g_date_time_new_now_utc(); 3770 g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z"); 3771 localfmt = g_strdup_printf("[%s] [ID: %08ld] %s", 3772 nowstr, syscall(__NR_gettid), fmt); 3773 } 3774 fmt = localfmt; 3775 } 3776 3777 if (use_syslog) { 3778 int priority = LOG_ERR; 3779 switch (level) { 3780 case FUSE_LOG_EMERG: 3781 priority = LOG_EMERG; 3782 break; 3783 case FUSE_LOG_ALERT: 3784 priority = LOG_ALERT; 3785 break; 3786 case FUSE_LOG_CRIT: 3787 priority = LOG_CRIT; 3788 break; 3789 case FUSE_LOG_ERR: 3790 priority = LOG_ERR; 3791 break; 3792 case FUSE_LOG_WARNING: 3793 priority = LOG_WARNING; 3794 break; 3795 case FUSE_LOG_NOTICE: 
3796 priority = LOG_NOTICE; 3797 break; 3798 case FUSE_LOG_INFO: 3799 priority = LOG_INFO; 3800 break; 3801 case FUSE_LOG_DEBUG: 3802 priority = LOG_DEBUG; 3803 break; 3804 } 3805 vsyslog(priority, fmt, ap); 3806 } else { 3807 vfprintf(stderr, fmt, ap); 3808 } 3809} 3810 3811static void setup_root(struct lo_data *lo, struct lo_inode *root) 3812{ 3813 int fd, res; 3814 struct stat stat; 3815 uint64_t mnt_id; 3816 3817 fd = open("/", O_PATH); 3818 if (fd == -1) { 3819 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source); 3820 exit(1); 3821 } 3822 3823 res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, 3824 &mnt_id); 3825 if (res == -1) { 3826 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source); 3827 exit(1); 3828 } 3829 3830 root->filetype = S_IFDIR; 3831 root->fd = fd; 3832 root->key.ino = stat.st_ino; 3833 root->key.dev = stat.st_dev; 3834 root->key.mnt_id = mnt_id; 3835 root->nlookup = 2; 3836 g_atomic_int_set(&root->refcount, 2); 3837 if (lo->posix_lock) { 3838 pthread_mutex_init(&root->plock_mutex, NULL); 3839 root->posix_locks = g_hash_table_new_full( 3840 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); 3841 } 3842} 3843 3844static guint lo_key_hash(gconstpointer key) 3845{ 3846 const struct lo_key *lkey = key; 3847 3848 return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id; 3849} 3850 3851static gboolean lo_key_equal(gconstpointer a, gconstpointer b) 3852{ 3853 const struct lo_key *la = a; 3854 const struct lo_key *lb = b; 3855 3856 return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id; 3857} 3858 3859static void fuse_lo_data_cleanup(struct lo_data *lo) 3860{ 3861 if (lo->inodes) { 3862 g_hash_table_destroy(lo->inodes); 3863 } 3864 3865 if (lo->root.posix_locks) { 3866 g_hash_table_destroy(lo->root.posix_locks); 3867 } 3868 lo_map_destroy(&lo->fd_map); 3869 lo_map_destroy(&lo->dirp_map); 3870 lo_map_destroy(&lo->ino_map); 3871 3872 if (lo->proc_self_fd >= 0) { 3873 
close(lo->proc_self_fd); 3874 } 3875 3876 if (lo->root.fd >= 0) { 3877 close(lo->root.fd); 3878 } 3879 3880 free(lo->xattrmap); 3881 free_xattrmap(lo); 3882 free(lo->xattr_security_capability); 3883 free(lo->source); 3884} 3885 3886static void qemu_version(void) 3887{ 3888 printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n"); 3889} 3890 3891int main(int argc, char *argv[]) 3892{ 3893 struct fuse_args args = FUSE_ARGS_INIT(argc, argv); 3894 struct fuse_session *se; 3895 struct fuse_cmdline_opts opts; 3896 struct lo_data lo = { 3897 .sandbox = SANDBOX_NAMESPACE, 3898 .debug = 0, 3899 .writeback = 0, 3900 .posix_lock = 0, 3901 .allow_direct_io = 0, 3902 .proc_self_fd = -1, 3903 .user_killpriv_v2 = -1, 3904 .user_posix_acl = -1, 3905 }; 3906 struct lo_map_elem *root_elem; 3907 struct lo_map_elem *reserve_elem; 3908 int ret = -1; 3909 3910 /* Initialize time conversion information for localtime_r(). */ 3911 tzset(); 3912 3913 /* Don't mask creation mode, kernel already did that */ 3914 umask(0); 3915 3916 qemu_init_exec_dir(argv[0]); 3917 3918 pthread_mutex_init(&lo.mutex, NULL); 3919 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); 3920 lo.root.fd = -1; 3921 lo.root.fuse_ino = FUSE_ROOT_ID; 3922 lo.cache = CACHE_AUTO; 3923 3924 /* 3925 * Set up the ino map like this: 3926 * [0] Reserved (will not be used) 3927 * [1] Root inode 3928 */ 3929 lo_map_init(&lo.ino_map); 3930 reserve_elem = lo_map_reserve(&lo.ino_map, 0); 3931 if (!reserve_elem) { 3932 fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n"); 3933 goto err_out1; 3934 } 3935 reserve_elem->in_use = false; 3936 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); 3937 if (!root_elem) { 3938 fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n"); 3939 goto err_out1; 3940 } 3941 root_elem->inode = &lo.root; 3942 3943 lo_map_init(&lo.dirp_map); 3944 lo_map_init(&lo.fd_map); 3945 3946 if (fuse_parse_cmdline(&args, &opts) != 0) { 3947 goto err_out1; 3948 } 3949 
fuse_set_log_func(log_func); 3950 use_syslog = opts.syslog; 3951 if (use_syslog) { 3952 openlog("virtiofsd", LOG_PID, LOG_DAEMON); 3953 } 3954 3955 if (opts.show_help) { 3956 printf("usage: %s [options]\n\n", argv[0]); 3957 fuse_cmdline_help(); 3958 printf(" -o source=PATH shared directory tree\n"); 3959 fuse_lowlevel_help(); 3960 ret = 0; 3961 goto err_out1; 3962 } else if (opts.show_version) { 3963 qemu_version(); 3964 fuse_lowlevel_version(); 3965 ret = 0; 3966 goto err_out1; 3967 } else if (opts.print_capabilities) { 3968 print_capabilities(); 3969 ret = 0; 3970 goto err_out1; 3971 } 3972 3973 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { 3974 goto err_out1; 3975 } 3976 3977 if (opts.log_level != 0) { 3978 current_log_level = opts.log_level; 3979 } else { 3980 /* default log level is INFO */ 3981 current_log_level = FUSE_LOG_INFO; 3982 } 3983 lo.debug = opts.debug; 3984 if (lo.debug) { 3985 current_log_level = FUSE_LOG_DEBUG; 3986 } 3987 if (lo.source) { 3988 struct stat stat; 3989 int res; 3990 3991 res = lstat(lo.source, &stat); 3992 if (res == -1) { 3993 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", 3994 lo.source); 3995 exit(1); 3996 } 3997 if (!S_ISDIR(stat.st_mode)) { 3998 fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); 3999 exit(1); 4000 } 4001 } else { 4002 lo.source = strdup("/"); 4003 if (!lo.source) { 4004 fuse_log(FUSE_LOG_ERR, "failed to strdup source\n"); 4005 goto err_out1; 4006 } 4007 } 4008 4009 if (lo.xattrmap) { 4010 lo.xattr = 1; 4011 parse_xattrmap(&lo); 4012 } 4013 4014 if (!lo.timeout_set) { 4015 switch (lo.cache) { 4016 case CACHE_NONE: 4017 lo.timeout = 0.0; 4018 break; 4019 4020 case CACHE_AUTO: 4021 lo.timeout = 1.0; 4022 break; 4023 4024 case CACHE_ALWAYS: 4025 lo.timeout = 86400.0; 4026 break; 4027 } 4028 } else if (lo.timeout < 0) { 4029 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout); 4030 exit(1); 4031 } 4032 4033 if (lo.user_posix_acl == 1 && !lo.xattr) { 4034 
fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled." 4035 "\n"); 4036 exit(1); 4037 } 4038 4039 lo.use_statx = true; 4040 4041 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); 4042 if (se == NULL) { 4043 goto err_out1; 4044 } 4045 4046 if (fuse_set_signal_handlers(se) != 0) { 4047 goto err_out2; 4048 } 4049 4050 if (fuse_session_mount(se) != 0) { 4051 goto err_out3; 4052 } 4053 4054 fuse_daemonize(opts.foreground); 4055 4056 setup_nofile_rlimit(opts.rlimit_nofile); 4057 4058 /* Must be before sandbox since it wants /proc */ 4059 setup_capng(); 4060 4061 setup_sandbox(&lo, se, opts.syslog); 4062 4063 setup_root(&lo, &lo.root); 4064 /* Block until ctrl+c or fusermount -u */ 4065 ret = virtio_loop(se); 4066 4067 fuse_session_unmount(se); 4068 cleanup_capng(); 4069err_out3: 4070 fuse_remove_signal_handlers(se); 4071err_out2: 4072 fuse_session_destroy(se); 4073err_out1: 4074 fuse_opt_free_args(&args); 4075 4076 fuse_lo_data_cleanup(&lo); 4077 4078 return ret ? 1 : 0; 4079}