cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

passthrough_ll.c (109004B)


      1/*
      2 * FUSE: Filesystem in Userspace
      3 * Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
      4 *
      5 * This program can be distributed under the terms of the GNU GPLv2.
      6 * See the file COPYING.
      7 */
      8
      9/*
     10 *
     11 * This file system mirrors the existing file system hierarchy of the
     12 * system, starting at the root file system. This is implemented by
     13 * just "passing through" all requests to the corresponding user-space
     14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
     15 * this implementation uses the low-level API. Its performance should
     16 * be the least bad among the three, but many operations are not
     17 * implemented. In particular, it is not possible to remove files (or
     18 * directories) because the code necessary to defer actual removal
     19 * until the file is not opened anymore would make the example much
     20 * more complicated.
     21 *
     22 * When writeback caching is enabled (-o writeback mount option), it
     23 * is only possible to write to files for which the mounting user has
     24 * read permissions. This is because the writeback cache requires the
     25 * kernel to be able to issue read requests for all files (which the
     26 * passthrough filesystem cannot satisfy if it can't read the file in
     27 * the underlying filesystem).
     28 *
     29 * Compile with:
     30 *
     31 *     gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
     32 * passthrough_ll
     33 *
     34 * ## Source code ##
     35 * \include passthrough_ll.c
     36 */
     37
     38#include "qemu/osdep.h"
     39#include "qemu/timer.h"
     40#include "qemu-version.h"
     41#include "qemu-common.h"
     42#include "fuse_virtio.h"
     43#include "fuse_log.h"
     44#include "fuse_lowlevel.h"
     45#include "standard-headers/linux/fuse.h"
     46#include <cap-ng.h>
     47#include <dirent.h>
     48#include <pthread.h>
     49#include <sys/file.h>
     50#include <sys/mount.h>
     51#include <sys/prctl.h>
     52#include <sys/resource.h>
     53#include <sys/syscall.h>
     54#include <sys/wait.h>
     55#include <sys/xattr.h>
     56#include <syslog.h>
     57
     58#include "qemu/cutils.h"
     59#include "passthrough_helpers.h"
     60#include "passthrough_seccomp.h"
     61
     62/* Keep track of inode posix locks for each owner. */
     63struct lo_inode_plock {
     64    uint64_t lock_owner;
     65    int fd; /* fd for OFD locks */
     66};
     67
     68struct lo_map_elem {
     69    union {
     70        struct lo_inode *inode;
     71        struct lo_dirp *dirp;
     72        int fd;
     73        ssize_t freelist;
     74    };
     75    bool in_use;
     76};
     77
     78/* Maps FUSE fh or ino values to internal objects */
     79struct lo_map {
     80    struct lo_map_elem *elems;
     81    size_t nelems;
     82    ssize_t freelist;
     83};
     84
     85struct lo_key {
     86    ino_t ino;
     87    dev_t dev;
     88    uint64_t mnt_id;
     89};
     90
     91struct lo_inode {
     92    int fd;
     93
     94    /*
     95     * Atomic reference count for this object.  The nlookup field holds a
     96     * reference and release it when nlookup reaches 0.
     97     */
     98    gint refcount;
     99
    100    struct lo_key key;
    101
    102    /*
    103     * This counter keeps the inode alive during the FUSE session.
    104     * Incremented when the FUSE inode number is sent in a reply
    105     * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc).  Decremented when an inode is
    106     * released by a FUSE_FORGET request.
    107     *
    108     * Note that this value is untrusted because the client can manipulate
    109     * it arbitrarily using FUSE_FORGET requests.
    110     *
    111     * Protected by lo->mutex.
    112     */
    113    uint64_t nlookup;
    114
    115    fuse_ino_t fuse_ino;
    116    pthread_mutex_t plock_mutex;
    117    GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
    118
    119    mode_t filetype;
    120};
    121
    122struct lo_cred {
    123    uid_t euid;
    124    gid_t egid;
    125    mode_t umask;
    126};
    127
    128enum {
    129    CACHE_NONE,
    130    CACHE_AUTO,
    131    CACHE_ALWAYS,
    132};
    133
    134enum {
    135    SANDBOX_NAMESPACE,
    136    SANDBOX_CHROOT,
    137};
    138
    139typedef struct xattr_map_entry {
    140    char *key;
    141    char *prepend;
    142    unsigned int flags;
    143} XattrMapEntry;
    144
    145struct lo_data {
    146    pthread_mutex_t mutex;
    147    int sandbox;
    148    int debug;
    149    int writeback;
    150    int flock;
    151    int posix_lock;
    152    int xattr;
    153    char *xattrmap;
    154    char *xattr_security_capability;
    155    char *source;
    156    char *modcaps;
    157    double timeout;
    158    int cache;
    159    int timeout_set;
    160    int readdirplus_set;
    161    int readdirplus_clear;
    162    int allow_direct_io;
    163    int announce_submounts;
    164    bool use_statx;
    165    struct lo_inode root;
    166    GHashTable *inodes; /* protected by lo->mutex */
    167    struct lo_map ino_map; /* protected by lo->mutex */
    168    struct lo_map dirp_map; /* protected by lo->mutex */
    169    struct lo_map fd_map; /* protected by lo->mutex */
    170    XattrMapEntry *xattr_map_list;
    171    size_t xattr_map_nentries;
    172
    173    /* An O_PATH file descriptor to /proc/self/fd/ */
    174    int proc_self_fd;
    175    int user_killpriv_v2, killpriv_v2;
    176    /* If set, virtiofsd is responsible for setting umask during creation */
    177    bool change_umask;
    178    int user_posix_acl, posix_acl;
    179};
    180
    181static const struct fuse_opt lo_opts[] = {
    182    { "sandbox=namespace",
    183      offsetof(struct lo_data, sandbox),
    184      SANDBOX_NAMESPACE },
    185    { "sandbox=chroot",
    186      offsetof(struct lo_data, sandbox),
    187      SANDBOX_CHROOT },
    188    { "writeback", offsetof(struct lo_data, writeback), 1 },
    189    { "no_writeback", offsetof(struct lo_data, writeback), 0 },
    190    { "source=%s", offsetof(struct lo_data, source), 0 },
    191    { "flock", offsetof(struct lo_data, flock), 1 },
    192    { "no_flock", offsetof(struct lo_data, flock), 0 },
    193    { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
    194    { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
    195    { "xattr", offsetof(struct lo_data, xattr), 1 },
    196    { "no_xattr", offsetof(struct lo_data, xattr), 0 },
    197    { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
    198    { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
    199    { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
    200    { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
    201    { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
    202    { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
    203    { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
    204    { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
    205    { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
    206    { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
    207    { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
    208    { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
    209    { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
    210    { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
    211    { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 },
    212    { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 },
    213    FUSE_OPT_END
    214};
    215static bool use_syslog = false;
    216static int current_log_level;
    217static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
    218                                 uint64_t n);
    219
    220static struct {
    221    pthread_mutex_t mutex;
    222    void *saved;
    223} cap;
    224/* That we loaded cap-ng in the current thread from the saved */
    225static __thread bool cap_loaded = 0;
    226
    227static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
    228                                uint64_t mnt_id);
    229static int xattr_map_client(const struct lo_data *lo, const char *client_name,
    230                            char **out_name);
    231
    232static bool is_dot_or_dotdot(const char *name)
    233{
    234    return name[0] == '.' &&
    235           (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
    236}
    237
    238/* Is `path` a single path component that is not "." or ".."? */
    239static bool is_safe_path_component(const char *path)
    240{
    241    if (strchr(path, '/')) {
    242        return false;
    243    }
    244
    245    return !is_dot_or_dotdot(path);
    246}
    247
    248static bool is_empty(const char *name)
    249{
    250    return name[0] == '\0';
    251}
    252
    253static struct lo_data *lo_data(fuse_req_t req)
    254{
    255    return (struct lo_data *)fuse_req_userdata(req);
    256}
    257
    258/*
    259 * Load capng's state from our saved state if the current thread
    260 * hadn't previously been loaded.
    261 * returns 0 on success
    262 */
    263static int load_capng(void)
    264{
    265    if (!cap_loaded) {
    266        pthread_mutex_lock(&cap.mutex);
    267        capng_restore_state(&cap.saved);
    268        /*
    269         * restore_state free's the saved copy
    270         * so make another.
    271         */
    272        cap.saved = capng_save_state();
    273        if (!cap.saved) {
    274            pthread_mutex_unlock(&cap.mutex);
    275            fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
    276            return -EINVAL;
    277        }
    278        pthread_mutex_unlock(&cap.mutex);
    279
    280        /*
    281         * We want to use the loaded state for our pid,
    282         * not the original
    283         */
    284        capng_setpid(syscall(SYS_gettid));
    285        cap_loaded = true;
    286    }
    287    return 0;
    288}
    289
    290/*
    291 * Helpers for dropping and regaining effective capabilities. Returns 0
    292 * on success, error otherwise
    293 */
    294static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
    295{
    296    int cap, ret;
    297
    298    cap = capng_name_to_capability(cap_name);
    299    if (cap < 0) {
    300        ret = errno;
    301        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
    302                 cap_name, strerror(errno));
    303        goto out;
    304    }
    305
    306    if (load_capng()) {
    307        ret = errno;
    308        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
    309        goto out;
    310    }
    311
    312    /* We dont have this capability in effective set already. */
    313    if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
    314        ret = 0;
    315        goto out;
    316    }
    317
    318    if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
    319        ret = errno;
    320        fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
    321        goto out;
    322    }
    323
    324    if (capng_apply(CAPNG_SELECT_CAPS)) {
    325        ret = errno;
    326        fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
    327        goto out;
    328    }
    329
    330    ret = 0;
    331    if (cap_dropped) {
    332        *cap_dropped = true;
    333    }
    334
    335out:
    336    return ret;
    337}
    338
    339static int gain_effective_cap(const char *cap_name)
    340{
    341    int cap;
    342    int ret = 0;
    343
    344    cap = capng_name_to_capability(cap_name);
    345    if (cap < 0) {
    346        ret = errno;
    347        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
    348                 cap_name, strerror(errno));
    349        goto out;
    350    }
    351
    352    if (load_capng()) {
    353        ret = errno;
    354        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
    355        goto out;
    356    }
    357
    358    if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
    359        ret = errno;
    360        fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
    361        goto out;
    362    }
    363
    364    if (capng_apply(CAPNG_SELECT_CAPS)) {
    365        ret = errno;
    366        fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
    367        goto out;
    368    }
    369    ret = 0;
    370
    371out:
    372    return ret;
    373}
    374
    375/*
    376 * The host kernel normally drops security.capability xattr's on
    377 * any write, however if we're remapping xattr names we need to drop
    378 * whatever the clients security.capability is actually stored as.
    379 */
    380static int drop_security_capability(const struct lo_data *lo, int fd)
    381{
    382    if (!lo->xattr_security_capability) {
    383        /* We didn't remap the name, let the host kernel do it */
    384        return 0;
    385    }
    386    if (!fremovexattr(fd, lo->xattr_security_capability)) {
    387        /* All good */
    388        return 0;
    389    }
    390
    391    switch (errno) {
    392    case ENODATA:
    393        /* Attribute didn't exist, that's fine */
    394        return 0;
    395
    396    case ENOTSUP:
    397        /* FS didn't support attribute anyway, also fine */
    398        return 0;
    399
    400    default:
    401        /* Hmm other error */
    402        return errno;
    403    }
    404}
    405
    406static void lo_map_init(struct lo_map *map)
    407{
    408    map->elems = NULL;
    409    map->nelems = 0;
    410    map->freelist = -1;
    411}
    412
    413static void lo_map_destroy(struct lo_map *map)
    414{
    415    g_free(map->elems);
    416}
    417
    418static int lo_map_grow(struct lo_map *map, size_t new_nelems)
    419{
    420    struct lo_map_elem *new_elems;
    421    size_t i;
    422
    423    if (new_nelems <= map->nelems) {
    424        return 1;
    425    }
    426
    427    new_elems = g_try_realloc_n(map->elems, new_nelems, sizeof(map->elems[0]));
    428    if (!new_elems) {
    429        return 0;
    430    }
    431
    432    for (i = map->nelems; i < new_nelems; i++) {
    433        new_elems[i].freelist = i + 1;
    434        new_elems[i].in_use = false;
    435    }
    436    new_elems[new_nelems - 1].freelist = -1;
    437
    438    map->elems = new_elems;
    439    map->freelist = map->nelems;
    440    map->nelems = new_nelems;
    441    return 1;
    442}
    443
    444static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
    445{
    446    struct lo_map_elem *elem;
    447
    448    if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
    449        return NULL;
    450    }
    451
    452    elem = &map->elems[map->freelist];
    453    map->freelist = elem->freelist;
    454
    455    elem->in_use = true;
    456
    457    return elem;
    458}
    459
    460static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
    461{
    462    ssize_t *prev;
    463
    464    if (!lo_map_grow(map, key + 1)) {
    465        return NULL;
    466    }
    467
    468    for (prev = &map->freelist; *prev != -1;
    469         prev = &map->elems[*prev].freelist) {
    470        if (*prev == key) {
    471            struct lo_map_elem *elem = &map->elems[key];
    472
    473            *prev = elem->freelist;
    474            elem->in_use = true;
    475            return elem;
    476        }
    477    }
    478    return NULL;
    479}
    480
    481static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
    482{
    483    if (key >= map->nelems) {
    484        return NULL;
    485    }
    486    if (!map->elems[key].in_use) {
    487        return NULL;
    488    }
    489    return &map->elems[key];
    490}
    491
    492static void lo_map_remove(struct lo_map *map, size_t key)
    493{
    494    struct lo_map_elem *elem;
    495
    496    if (key >= map->nelems) {
    497        return;
    498    }
    499
    500    elem = &map->elems[key];
    501    if (!elem->in_use) {
    502        return;
    503    }
    504
    505    elem->in_use = false;
    506
    507    elem->freelist = map->freelist;
    508    map->freelist = key;
    509}
    510
    511/* Assumes lo->mutex is held */
    512static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd)
    513{
    514    struct lo_map_elem *elem;
    515
    516    elem = lo_map_alloc_elem(&lo->fd_map);
    517    if (!elem) {
    518        return -1;
    519    }
    520
    521    elem->fd = fd;
    522    return elem - lo->fd_map.elems;
    523}
    524
    525/* Assumes lo->mutex is held */
    526static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
    527{
    528    struct lo_map_elem *elem;
    529
    530    elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
    531    if (!elem) {
    532        return -1;
    533    }
    534
    535    elem->dirp = dirp;
    536    return elem - lo_data(req)->dirp_map.elems;
    537}
    538
    539/* Assumes lo->mutex is held */
    540static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
    541{
    542    struct lo_map_elem *elem;
    543
    544    elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
    545    if (!elem) {
    546        return -1;
    547    }
    548
    549    elem->inode = inode;
    550    return elem - lo_data(req)->ino_map.elems;
    551}
    552
    553static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
    554{
    555    struct lo_inode *inode = *inodep;
    556
    557    if (!inode) {
    558        return;
    559    }
    560
    561    *inodep = NULL;
    562
    563    if (g_atomic_int_dec_and_test(&inode->refcount)) {
    564        close(inode->fd);
    565        free(inode);
    566    }
    567}
    568
    569/* Caller must release refcount using lo_inode_put() */
    570static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
    571{
    572    struct lo_data *lo = lo_data(req);
    573    struct lo_map_elem *elem;
    574
    575    pthread_mutex_lock(&lo->mutex);
    576    elem = lo_map_get(&lo->ino_map, ino);
    577    if (elem) {
    578        g_atomic_int_inc(&elem->inode->refcount);
    579    }
    580    pthread_mutex_unlock(&lo->mutex);
    581
    582    if (!elem) {
    583        return NULL;
    584    }
    585
    586    return elem->inode;
    587}
    588
    589/*
    590 * TODO Remove this helper and force callers to hold an inode refcount until
    591 * they are done with the fd.  This will be done in a later patch to make
    592 * review easier.
    593 */
    594static int lo_fd(fuse_req_t req, fuse_ino_t ino)
    595{
    596    struct lo_inode *inode = lo_inode(req, ino);
    597    int fd;
    598
    599    if (!inode) {
    600        return -1;
    601    }
    602
    603    fd = inode->fd;
    604    lo_inode_put(lo_data(req), &inode);
    605    return fd;
    606}
    607
    608/*
    609 * Open a file descriptor for an inode. Returns -EBADF if the inode is not a
    610 * regular file or a directory.
    611 *
    612 * Use this helper function instead of raw openat(2) to prevent security issues
    613 * when a malicious client opens special files such as block device nodes.
    614 * Symlink inodes are also rejected since symlinks must already have been
    615 * traversed on the client side.
    616 */
    617static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode,
    618                         int open_flags)
    619{
    620    g_autofree char *fd_str = g_strdup_printf("%d", inode->fd);
    621    int fd;
    622
    623    if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) {
    624        return -EBADF;
    625    }
    626
    627    /*
    628     * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier
    629     * that the inode is not a special file but if an external process races
    630     * with us then symlinks are traversed here. It is not possible to escape
    631     * the shared directory since it is mounted as "/" though.
    632     */
    633    fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW);
    634    if (fd < 0) {
    635        return -errno;
    636    }
    637    return fd;
    638}
    639
    640static void lo_init(void *userdata, struct fuse_conn_info *conn)
    641{
    642    struct lo_data *lo = (struct lo_data *)userdata;
    643
    644    if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
    645        conn->want |= FUSE_CAP_EXPORT_SUPPORT;
    646    }
    647
    648    if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
    649        fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
    650        conn->want |= FUSE_CAP_WRITEBACK_CACHE;
    651    }
    652    if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
    653        if (lo->flock) {
    654            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
    655            conn->want |= FUSE_CAP_FLOCK_LOCKS;
    656        } else {
    657            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
    658            conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
    659        }
    660    }
    661
    662    if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
    663        if (lo->posix_lock) {
    664            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
    665            conn->want |= FUSE_CAP_POSIX_LOCKS;
    666        } else {
    667            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
    668            conn->want &= ~FUSE_CAP_POSIX_LOCKS;
    669        }
    670    }
    671
    672    if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
    673        lo->readdirplus_clear) {
    674        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
    675        conn->want &= ~FUSE_CAP_READDIRPLUS;
    676    }
    677
    678    if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
    679        fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
    680                 "does not support it\n");
    681        lo->announce_submounts = false;
    682    }
    683
    684    if (lo->user_killpriv_v2 == 1) {
    685        /*
    686         * User explicitly asked for this option. Enable it unconditionally.
    687         * If connection does not have this capability, it should fail
    688         * in fuse_lowlevel.c
    689         */
    690        fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
    691        conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
    692        lo->killpriv_v2 = 1;
    693    } else if (lo->user_killpriv_v2 == -1 &&
    694               conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) {
    695        /*
    696         * User did not specify a value for killpriv_v2. By default enable it
    697         * if connection offers this capability
    698         */
    699        fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
    700        conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
    701        lo->killpriv_v2 = 1;
    702    } else {
    703        /*
    704         * Either user specified to disable killpriv_v2, or connection does
    705         * not offer this capability. Disable killpriv_v2 in both the cases
    706         */
    707        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
    708        conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
    709        lo->killpriv_v2 = 0;
    710    }
    711
    712    if (lo->user_posix_acl == 1) {
    713        /*
    714         * User explicitly asked for this option. Enable it unconditionally.
    715         * If connection does not have this capability, print error message
    716         * now. It will fail later in fuse_lowlevel.c
    717         */
    718        if (!(conn->capable & FUSE_CAP_POSIX_ACL) ||
    719            !(conn->capable & FUSE_CAP_DONT_MASK) ||
    720            !(conn->capable & FUSE_CAP_SETXATTR_EXT)) {
    721            fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl."
    722                     " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK"
    723                     " or FUSE_SETXATTR_EXT capability.\n");
    724        } else {
    725            fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n");
    726        }
    727
    728        conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK |
    729                      FUSE_CAP_SETXATTR_EXT;
    730        lo->change_umask = true;
    731        lo->posix_acl = true;
    732    } else {
    733        /* User either did not specify anything or wants it disabled */
    734        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n");
    735        conn->want &= ~FUSE_CAP_POSIX_ACL;
    736    }
    737}
    738
    739static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
    740                       struct fuse_file_info *fi)
    741{
    742    int res;
    743    struct stat buf;
    744    struct lo_data *lo = lo_data(req);
    745
    746    (void)fi;
    747
    748    res =
    749        fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
    750    if (res == -1) {
    751        return (void)fuse_reply_err(req, errno);
    752    }
    753
    754    fuse_reply_attr(req, &buf, lo->timeout);
    755}
    756
    757static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
    758{
    759    struct lo_data *lo = lo_data(req);
    760    struct lo_map_elem *elem;
    761
    762    pthread_mutex_lock(&lo->mutex);
    763    elem = lo_map_get(&lo->fd_map, fi->fh);
    764    pthread_mutex_unlock(&lo->mutex);
    765
    766    if (!elem) {
    767        return -1;
    768    }
    769
    770    return elem->fd;
    771}
    772
    773static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
    774                       int valid, struct fuse_file_info *fi)
    775{
    776    int saverr;
    777    char procname[64];
    778    struct lo_data *lo = lo_data(req);
    779    struct lo_inode *inode;
    780    int ifd;
    781    int res;
    782    int fd = -1;
    783
    784    inode = lo_inode(req, ino);
    785    if (!inode) {
    786        fuse_reply_err(req, EBADF);
    787        return;
    788    }
    789
    790    ifd = inode->fd;
    791
    792    /* If fi->fh is invalid we'll report EBADF later */
    793    if (fi) {
    794        fd = lo_fi_fd(req, fi);
    795    }
    796
    797    if (valid & FUSE_SET_ATTR_MODE) {
    798        if (fi) {
    799            res = fchmod(fd, attr->st_mode);
    800        } else {
    801            sprintf(procname, "%i", ifd);
    802            res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
    803        }
    804        if (res == -1) {
    805            saverr = errno;
    806            goto out_err;
    807        }
    808    }
    809    if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
    810        uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
    811        gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
    812
    813        saverr = drop_security_capability(lo, ifd);
    814        if (saverr) {
    815            goto out_err;
    816        }
    817
    818        res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
    819        if (res == -1) {
    820            saverr = errno;
    821            goto out_err;
    822        }
    823    }
    824    if (valid & FUSE_SET_ATTR_SIZE) {
    825        int truncfd;
    826        bool kill_suidgid;
    827        bool cap_fsetid_dropped = false;
    828
    829        kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
    830        if (fi) {
    831            truncfd = fd;
    832        } else {
    833            truncfd = lo_inode_open(lo, inode, O_RDWR);
    834            if (truncfd < 0) {
    835                saverr = -truncfd;
    836                goto out_err;
    837            }
    838        }
    839
    840        saverr = drop_security_capability(lo, truncfd);
    841        if (saverr) {
    842            if (!fi) {
    843                close(truncfd);
    844            }
    845            goto out_err;
    846        }
    847
    848        if (kill_suidgid) {
    849            res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
    850            if (res != 0) {
    851                saverr = res;
    852                if (!fi) {
    853                    close(truncfd);
    854                }
    855                goto out_err;
    856            }
    857        }
    858
    859        res = ftruncate(truncfd, attr->st_size);
    860        saverr = res == -1 ? errno : 0;
    861
    862        if (cap_fsetid_dropped) {
    863            if (gain_effective_cap("FSETID")) {
    864                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
    865            }
    866        }
    867        if (!fi) {
    868            close(truncfd);
    869        }
    870        if (res == -1) {
    871            goto out_err;
    872        }
    873    }
    874    if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
    875        struct timespec tv[2];
    876
    877        tv[0].tv_sec = 0;
    878        tv[1].tv_sec = 0;
    879        tv[0].tv_nsec = UTIME_OMIT;
    880        tv[1].tv_nsec = UTIME_OMIT;
    881
    882        if (valid & FUSE_SET_ATTR_ATIME_NOW) {
    883            tv[0].tv_nsec = UTIME_NOW;
    884        } else if (valid & FUSE_SET_ATTR_ATIME) {
    885            tv[0] = attr->st_atim;
    886        }
    887
    888        if (valid & FUSE_SET_ATTR_MTIME_NOW) {
    889            tv[1].tv_nsec = UTIME_NOW;
    890        } else if (valid & FUSE_SET_ATTR_MTIME) {
    891            tv[1] = attr->st_mtim;
    892        }
    893
    894        if (fi) {
    895            res = futimens(fd, tv);
    896        } else {
    897            sprintf(procname, "%i", inode->fd);
    898            res = utimensat(lo->proc_self_fd, procname, tv, 0);
    899        }
    900        if (res == -1) {
    901            saverr = errno;
    902            goto out_err;
    903        }
    904    }
    905    lo_inode_put(lo, &inode);
    906
    907    return lo_getattr(req, ino, fi);
    908
    909out_err:
    910    lo_inode_put(lo, &inode);
    911    fuse_reply_err(req, saverr);
    912}
    913
    914static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
    915                                uint64_t mnt_id)
    916{
    917    struct lo_inode *p;
    918    struct lo_key key = {
    919        .ino = st->st_ino,
    920        .dev = st->st_dev,
    921        .mnt_id = mnt_id,
    922    };
    923
    924    pthread_mutex_lock(&lo->mutex);
    925    p = g_hash_table_lookup(lo->inodes, &key);
    926    if (p) {
    927        assert(p->nlookup > 0);
    928        p->nlookup++;
    929        g_atomic_int_inc(&p->refcount);
    930    }
    931    pthread_mutex_unlock(&lo->mutex);
    932
    933    return p;
    934}
    935
    936/* value_destroy_func for posix_locks GHashTable */
    937static void posix_locks_value_destroy(gpointer data)
    938{
    939    struct lo_inode_plock *plock = data;
    940
    941    /*
    942     * We had used open() for locks and had only one fd. So
    943     * closing this fd should release all OFD locks.
    944     */
    945    close(plock->fd);
    946    free(plock);
    947}
    948
    949static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
    950                    struct stat *statbuf, int flags, uint64_t *mnt_id)
    951{
    952    int res;
    953
    954#if defined(CONFIG_STATX) && defined(STATX_MNT_ID)
    955    if (lo->use_statx) {
    956        struct statx statxbuf;
    957
    958        res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
    959                    &statxbuf);
    960        if (!res) {
    961            memset(statbuf, 0, sizeof(*statbuf));
    962            statbuf->st_dev = makedev(statxbuf.stx_dev_major,
    963                                      statxbuf.stx_dev_minor);
    964            statbuf->st_ino = statxbuf.stx_ino;
    965            statbuf->st_mode = statxbuf.stx_mode;
    966            statbuf->st_nlink = statxbuf.stx_nlink;
    967            statbuf->st_uid = statxbuf.stx_uid;
    968            statbuf->st_gid = statxbuf.stx_gid;
    969            statbuf->st_rdev = makedev(statxbuf.stx_rdev_major,
    970                                       statxbuf.stx_rdev_minor);
    971            statbuf->st_size = statxbuf.stx_size;
    972            statbuf->st_blksize = statxbuf.stx_blksize;
    973            statbuf->st_blocks = statxbuf.stx_blocks;
    974            statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec;
    975            statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec;
    976            statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec;
    977            statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec;
    978            statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec;
    979            statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec;
    980
    981            if (statxbuf.stx_mask & STATX_MNT_ID) {
    982                *mnt_id = statxbuf.stx_mnt_id;
    983            } else {
    984                *mnt_id = 0;
    985            }
    986            return 0;
    987        } else if (errno != ENOSYS) {
    988            return -1;
    989        }
    990        lo->use_statx = false;
    991        /* fallback */
    992    }
    993#endif
    994    res = fstatat(dirfd, pathname, statbuf, flags);
    995    if (res == -1) {
    996        return -1;
    997    }
    998    *mnt_id = 0;
    999
   1000    return 0;
   1001}
   1002
/*
 * Look up @name inside the directory @parent and fill in @e.
 *
 * Increments nlookup on the inode on success. unref_inode_lolocked() must be
 * called eventually to decrement nlookup again. If inodep is non-NULL, the
 * inode pointer is stored and the caller must call lo_inode_put().
 *
 * Returns 0 on success or a positive errno value on failure. On failure
 * *inodep (if given) is NULL.
 */
static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
                        struct fuse_entry_param *e,
                        struct lo_inode **inodep)
{
    int newfd;
    int res;
    int saverr;
    uint64_t mnt_id;
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode = NULL;
    struct lo_inode *dir = lo_inode(req, parent);

    if (inodep) {
        *inodep = NULL; /* in case there is an error */
    }

    /*
     * name_to_handle_at() and open_by_handle_at() can reach here with fuse
     * mount point in guest, but we don't have its inode info in the
     * ino_map.
     */
    if (!dir) {
        return ENOENT;
    }

    memset(e, 0, sizeof(*e));
    e->attr_timeout = lo->timeout;
    e->entry_timeout = lo->timeout;

    /* Do not allow escaping root directory */
    if (dir == &lo->root && strcmp(name, "..") == 0) {
        name = ".";
    }

    /* O_NOFOLLOW: a symlink is opened itself rather than its target */
    newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
    if (newfd == -1) {
        goto out_err;
    }

    /* Stat the fd just opened (empty path + AT_EMPTY_PATH) */
    res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
                   &mnt_id);
    if (res == -1) {
        goto out_err;
    }

    /*
     * A directory whose device or mount ID differs from its parent's is
     * flagged as a submount when announce_submounts is enabled.
     */
    if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
        (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
        e->attr_flags |= FUSE_ATTR_SUBMOUNT;
    }

    inode = lo_find(lo, &e->attr, mnt_id);
    if (inode) {
        /* Already known: lo_find() took the references, newfd is surplus */
        close(newfd);
    } else {
        /*
         * NOTE(review): lo_find() above and the insertion below take
         * lo->mutex separately, so two concurrent lookups of the same new
         * file could both reach this branch - confirm whether that is
         * prevented elsewhere.
         */
        inode = calloc(1, sizeof(struct lo_inode));
        if (!inode) {
            goto out_err;
        }

        /* cache only filetype */
        inode->filetype = (e->attr.st_mode & S_IFMT);

        /*
         * One for the caller and one for nlookup (released in
         * unref_inode_lolocked())
         */
        g_atomic_int_set(&inode->refcount, 2);

        inode->nlookup = 1;
        inode->fd = newfd;
        inode->key.ino = e->attr.st_ino;
        inode->key.dev = e->attr.st_dev;
        inode->key.mnt_id = mnt_id;
        if (lo->posix_lock) {
            pthread_mutex_init(&inode->plock_mutex, NULL);
            inode->posix_locks = g_hash_table_new_full(
                g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
        }
        pthread_mutex_lock(&lo->mutex);
        /*
         * NOTE(review): the result of lo_add_inode_mapping() is used
         * unchecked, whereas lo_add_dirp_mapping() and lo_add_fd_mapping()
         * are checked for -1 by their callers - confirm it cannot fail.
         */
        inode->fuse_ino = lo_add_inode_mapping(req, inode);
        g_hash_table_insert(lo->inodes, &inode->key, inode);
        pthread_mutex_unlock(&lo->mutex);
    }
    e->ino = inode->fuse_ino;

    /* Transfer ownership of inode pointer to caller or drop it */
    if (inodep) {
        *inodep = inode;
    } else {
        lo_inode_put(lo, &inode);
    }

    lo_inode_put(lo, &dir);

    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
             name, (unsigned long long)e->ino);

    return 0;

out_err:
    /* Save errno first: the cleanup calls below may overwrite it */
    saverr = errno;
    if (newfd != -1) {
        close(newfd);
    }
    lo_inode_put(lo, &inode);
    lo_inode_put(lo, &dir);
    return saverr;
}
   1116
   1117static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
   1118{
   1119    struct fuse_entry_param e;
   1120    int err;
   1121
   1122    fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
   1123             name);
   1124
   1125    if (is_empty(name)) {
   1126        fuse_reply_err(req, ENOENT);
   1127        return;
   1128    }
   1129
   1130    /*
   1131     * Don't use is_safe_path_component(), allow "." and ".." for NFS export
   1132     * support.
   1133     */
   1134    if (strchr(name, '/')) {
   1135        fuse_reply_err(req, EINVAL);
   1136        return;
   1137    }
   1138
   1139    err = lo_do_lookup(req, parent, name, &e, NULL);
   1140    if (err) {
   1141        fuse_reply_err(req, err);
   1142    } else {
   1143        fuse_reply_entry(req, &e);
   1144    }
   1145}
   1146
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * OURSYS_setresgid and OURSYS_setresuid select the *32 syscall number when
 * the headers define it and the plain one otherwise. They are passed to
 * syscall() by the credential helpers below.
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
   1163
   1164/*
   1165 * Change to uid/gid of caller so that file is created with
   1166 * ownership of caller.
   1167 * TODO: What about selinux context?
   1168 */
   1169static int lo_change_cred(fuse_req_t req, struct lo_cred *old,
   1170                          bool change_umask)
   1171{
   1172    int res;
   1173
   1174    old->euid = geteuid();
   1175    old->egid = getegid();
   1176
   1177    res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
   1178    if (res == -1) {
   1179        return errno;
   1180    }
   1181
   1182    res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
   1183    if (res == -1) {
   1184        int errno_save = errno;
   1185
   1186        syscall(OURSYS_setresgid, -1, old->egid, -1);
   1187        return errno_save;
   1188    }
   1189
   1190    if (change_umask) {
   1191        old->umask = umask(req->ctx.umask);
   1192    }
   1193    return 0;
   1194}
   1195
   1196/* Regain Privileges */
   1197static void lo_restore_cred(struct lo_cred *old, bool restore_umask)
   1198{
   1199    int res;
   1200
   1201    res = syscall(OURSYS_setresuid, -1, old->euid, -1);
   1202    if (res == -1) {
   1203        fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
   1204        exit(1);
   1205    }
   1206
   1207    res = syscall(OURSYS_setresgid, -1, old->egid, -1);
   1208    if (res == -1) {
   1209        fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
   1210        exit(1);
   1211    }
   1212
   1213    if (restore_umask)
   1214        umask(old->umask);
   1215}
   1216
   1217/*
   1218 * A helper to change cred and drop capability. Returns 0 on success and
   1219 * errno on error
   1220 */
   1221static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old,
   1222                                   bool change_umask, const char *cap_name,
   1223                                   bool *cap_dropped)
   1224{
   1225    int ret;
   1226    bool __cap_dropped;
   1227
   1228    assert(cap_name);
   1229
   1230    ret = drop_effective_cap(cap_name, &__cap_dropped);
   1231    if (ret) {
   1232        return ret;
   1233    }
   1234
   1235    ret = lo_change_cred(req, old, change_umask);
   1236    if (ret) {
   1237        if (__cap_dropped) {
   1238            if (gain_effective_cap(cap_name)) {
   1239                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
   1240            }
   1241        }
   1242    }
   1243
   1244    if (cap_dropped) {
   1245        *cap_dropped = __cap_dropped;
   1246    }
   1247    return ret;
   1248}
   1249
   1250static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask,
   1251                                     const char *cap_name)
   1252{
   1253    assert(cap_name);
   1254
   1255    lo_restore_cred(old, restore_umask);
   1256
   1257    if (gain_effective_cap(cap_name)) {
   1258        fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
   1259    }
   1260}
   1261
   1262static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
   1263                             const char *name, mode_t mode, dev_t rdev,
   1264                             const char *link)
   1265{
   1266    int res;
   1267    int saverr;
   1268    struct lo_data *lo = lo_data(req);
   1269    struct lo_inode *dir;
   1270    struct fuse_entry_param e;
   1271    struct lo_cred old = {};
   1272
   1273    if (is_empty(name)) {
   1274        fuse_reply_err(req, ENOENT);
   1275        return;
   1276    }
   1277
   1278    if (!is_safe_path_component(name)) {
   1279        fuse_reply_err(req, EINVAL);
   1280        return;
   1281    }
   1282
   1283    dir = lo_inode(req, parent);
   1284    if (!dir) {
   1285        fuse_reply_err(req, EBADF);
   1286        return;
   1287    }
   1288
   1289    saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode));
   1290    if (saverr) {
   1291        goto out;
   1292    }
   1293
   1294    res = mknod_wrapper(dir->fd, name, link, mode, rdev);
   1295
   1296    saverr = errno;
   1297
   1298    lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode));
   1299
   1300    if (res == -1) {
   1301        goto out;
   1302    }
   1303
   1304    saverr = lo_do_lookup(req, parent, name, &e, NULL);
   1305    if (saverr) {
   1306        goto out;
   1307    }
   1308
   1309    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
   1310             name, (unsigned long long)e.ino);
   1311
   1312    fuse_reply_entry(req, &e);
   1313    lo_inode_put(lo, &dir);
   1314    return;
   1315
   1316out:
   1317    lo_inode_put(lo, &dir);
   1318    fuse_reply_err(req, saverr);
   1319}
   1320
   1321static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
   1322                     mode_t mode, dev_t rdev)
   1323{
   1324    lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
   1325}
   1326
   1327static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
   1328                     mode_t mode)
   1329{
   1330    lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
   1331}
   1332
   1333static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
   1334                       const char *name)
   1335{
   1336    lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
   1337}
   1338
   1339static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
   1340                    const char *name)
   1341{
   1342    int res;
   1343    struct lo_data *lo = lo_data(req);
   1344    struct lo_inode *parent_inode;
   1345    struct lo_inode *inode;
   1346    struct fuse_entry_param e;
   1347    char procname[64];
   1348    int saverr;
   1349
   1350    if (is_empty(name)) {
   1351        fuse_reply_err(req, ENOENT);
   1352        return;
   1353    }
   1354
   1355    if (!is_safe_path_component(name)) {
   1356        fuse_reply_err(req, EINVAL);
   1357        return;
   1358    }
   1359
   1360    parent_inode = lo_inode(req, parent);
   1361    inode = lo_inode(req, ino);
   1362    if (!parent_inode || !inode) {
   1363        errno = EBADF;
   1364        goto out_err;
   1365    }
   1366
   1367    memset(&e, 0, sizeof(struct fuse_entry_param));
   1368    e.attr_timeout = lo->timeout;
   1369    e.entry_timeout = lo->timeout;
   1370
   1371    sprintf(procname, "%i", inode->fd);
   1372    res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
   1373                 AT_SYMLINK_FOLLOW);
   1374    if (res == -1) {
   1375        goto out_err;
   1376    }
   1377
   1378    res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
   1379    if (res == -1) {
   1380        goto out_err;
   1381    }
   1382
   1383    pthread_mutex_lock(&lo->mutex);
   1384    inode->nlookup++;
   1385    pthread_mutex_unlock(&lo->mutex);
   1386    e.ino = inode->fuse_ino;
   1387
   1388    fuse_log(FUSE_LOG_DEBUG, "  %lli/%s -> %lli\n", (unsigned long long)parent,
   1389             name, (unsigned long long)e.ino);
   1390
   1391    fuse_reply_entry(req, &e);
   1392    lo_inode_put(lo, &parent_inode);
   1393    lo_inode_put(lo, &inode);
   1394    return;
   1395
   1396out_err:
   1397    saverr = errno;
   1398    lo_inode_put(lo, &parent_inode);
   1399    lo_inode_put(lo, &inode);
   1400    fuse_reply_err(req, saverr);
   1401}
   1402
   1403/* Increments nlookup and caller must release refcount using lo_inode_put() */
   1404static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
   1405                                    const char *name)
   1406{
   1407    int res;
   1408    uint64_t mnt_id;
   1409    struct stat attr;
   1410    struct lo_data *lo = lo_data(req);
   1411    struct lo_inode *dir = lo_inode(req, parent);
   1412
   1413    if (!dir) {
   1414        return NULL;
   1415    }
   1416
   1417    res = do_statx(lo, dir->fd, name, &attr, AT_SYMLINK_NOFOLLOW, &mnt_id);
   1418    lo_inode_put(lo, &dir);
   1419    if (res == -1) {
   1420        return NULL;
   1421    }
   1422
   1423    return lo_find(lo, &attr, mnt_id);
   1424}
   1425
   1426static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
   1427{
   1428    int res;
   1429    struct lo_inode *inode;
   1430    struct lo_data *lo = lo_data(req);
   1431
   1432    if (is_empty(name)) {
   1433        fuse_reply_err(req, ENOENT);
   1434        return;
   1435    }
   1436
   1437    if (!is_safe_path_component(name)) {
   1438        fuse_reply_err(req, EINVAL);
   1439        return;
   1440    }
   1441
   1442    inode = lookup_name(req, parent, name);
   1443    if (!inode) {
   1444        fuse_reply_err(req, EIO);
   1445        return;
   1446    }
   1447
   1448    res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
   1449
   1450    fuse_reply_err(req, res == -1 ? errno : 0);
   1451    unref_inode_lolocked(lo, inode, 1);
   1452    lo_inode_put(lo, &inode);
   1453}
   1454
   1455static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
   1456                      fuse_ino_t newparent, const char *newname,
   1457                      unsigned int flags)
   1458{
   1459    int res;
   1460    struct lo_inode *parent_inode;
   1461    struct lo_inode *newparent_inode;
   1462    struct lo_inode *oldinode = NULL;
   1463    struct lo_inode *newinode = NULL;
   1464    struct lo_data *lo = lo_data(req);
   1465
   1466    if (is_empty(name) || is_empty(newname)) {
   1467        fuse_reply_err(req, ENOENT);
   1468        return;
   1469    }
   1470
   1471    if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
   1472        fuse_reply_err(req, EINVAL);
   1473        return;
   1474    }
   1475
   1476    parent_inode = lo_inode(req, parent);
   1477    newparent_inode = lo_inode(req, newparent);
   1478    if (!parent_inode || !newparent_inode) {
   1479        fuse_reply_err(req, EBADF);
   1480        goto out;
   1481    }
   1482
   1483    oldinode = lookup_name(req, parent, name);
   1484    newinode = lookup_name(req, newparent, newname);
   1485
   1486    if (!oldinode) {
   1487        fuse_reply_err(req, EIO);
   1488        goto out;
   1489    }
   1490
   1491    if (flags) {
   1492#ifndef SYS_renameat2
   1493        fuse_reply_err(req, EINVAL);
   1494#else
   1495        res = syscall(SYS_renameat2, parent_inode->fd, name,
   1496                        newparent_inode->fd, newname, flags);
   1497        if (res == -1 && errno == ENOSYS) {
   1498            fuse_reply_err(req, EINVAL);
   1499        } else {
   1500            fuse_reply_err(req, res == -1 ? errno : 0);
   1501        }
   1502#endif
   1503        goto out;
   1504    }
   1505
   1506    res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
   1507
   1508    fuse_reply_err(req, res == -1 ? errno : 0);
   1509out:
   1510    unref_inode_lolocked(lo, oldinode, 1);
   1511    unref_inode_lolocked(lo, newinode, 1);
   1512    lo_inode_put(lo, &oldinode);
   1513    lo_inode_put(lo, &newinode);
   1514    lo_inode_put(lo, &parent_inode);
   1515    lo_inode_put(lo, &newparent_inode);
   1516}
   1517
   1518static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
   1519{
   1520    int res;
   1521    struct lo_inode *inode;
   1522    struct lo_data *lo = lo_data(req);
   1523
   1524    if (is_empty(name)) {
   1525        fuse_reply_err(req, ENOENT);
   1526        return;
   1527    }
   1528
   1529    if (!is_safe_path_component(name)) {
   1530        fuse_reply_err(req, EINVAL);
   1531        return;
   1532    }
   1533
   1534    inode = lookup_name(req, parent, name);
   1535    if (!inode) {
   1536        fuse_reply_err(req, EIO);
   1537        return;
   1538    }
   1539
   1540    res = unlinkat(lo_fd(req, parent), name, 0);
   1541
   1542    fuse_reply_err(req, res == -1 ? errno : 0);
   1543    unref_inode_lolocked(lo, inode, 1);
   1544    lo_inode_put(lo, &inode);
   1545}
   1546
   1547/* To be called with lo->mutex held */
   1548static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
   1549{
   1550    if (!inode) {
   1551        return;
   1552    }
   1553
   1554    assert(inode->nlookup >= n);
   1555    inode->nlookup -= n;
   1556    if (!inode->nlookup) {
   1557        lo_map_remove(&lo->ino_map, inode->fuse_ino);
   1558        g_hash_table_remove(lo->inodes, &inode->key);
   1559        if (lo->posix_lock) {
   1560            if (g_hash_table_size(inode->posix_locks)) {
   1561                fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
   1562            }
   1563            g_hash_table_destroy(inode->posix_locks);
   1564            pthread_mutex_destroy(&inode->plock_mutex);
   1565        }
   1566        /* Drop our refcount from lo_do_lookup() */
   1567        lo_inode_put(lo, &inode);
   1568    }
   1569}
   1570
   1571static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
   1572                                 uint64_t n)
   1573{
   1574    if (!inode) {
   1575        return;
   1576    }
   1577
   1578    pthread_mutex_lock(&lo->mutex);
   1579    unref_inode(lo, inode, n);
   1580    pthread_mutex_unlock(&lo->mutex);
   1581}
   1582
   1583static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
   1584{
   1585    struct lo_data *lo = lo_data(req);
   1586    struct lo_inode *inode;
   1587
   1588    inode = lo_inode(req, ino);
   1589    if (!inode) {
   1590        return;
   1591    }
   1592
   1593    fuse_log(FUSE_LOG_DEBUG, "  forget %lli %lli -%lli\n",
   1594             (unsigned long long)ino, (unsigned long long)inode->nlookup,
   1595             (unsigned long long)nlookup);
   1596
   1597    unref_inode_lolocked(lo, inode, nlookup);
   1598    lo_inode_put(lo, &inode);
   1599}
   1600
/*
 * FUSE forget handler: drop @nlookup lookup references from inode @ino.
 * The request is always completed with fuse_reply_none().
 */
static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
{
    lo_forget_one(req, ino, nlookup);
    fuse_reply_none(req);
}
   1606
   1607static void lo_forget_multi(fuse_req_t req, size_t count,
   1608                            struct fuse_forget_data *forgets)
   1609{
   1610    int i;
   1611
   1612    for (i = 0; i < count; i++) {
   1613        lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
   1614    }
   1615    fuse_reply_none(req);
   1616}
   1617
   1618static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
   1619{
   1620    char buf[PATH_MAX + 1];
   1621    int res;
   1622
   1623    res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
   1624    if (res == -1) {
   1625        return (void)fuse_reply_err(req, errno);
   1626    }
   1627
   1628    if (res == sizeof(buf)) {
   1629        return (void)fuse_reply_err(req, ENAMETOOLONG);
   1630    }
   1631
   1632    buf[res] = '\0';
   1633
   1634    fuse_reply_readlink(req, buf);
   1635}
   1636
/* State kept for one open directory handle */
struct lo_dirp {
    /* Reference count, modified with g_atomic_int_*() */
    gint refcount;
    /* Directory stream obtained from fdopendir() */
    DIR *dp;
    /* Entry read by readdir() but not yet consumed, or NULL */
    struct dirent *entry;
    /* Directory offset the stream is positioned at, as tracked by readdir */
    off_t offset;
};
   1643
   1644static void lo_dirp_put(struct lo_dirp **dp)
   1645{
   1646    struct lo_dirp *d = *dp;
   1647
   1648    if (!d) {
   1649        return;
   1650    }
   1651    *dp = NULL;
   1652
   1653    if (g_atomic_int_dec_and_test(&d->refcount)) {
   1654        closedir(d->dp);
   1655        free(d);
   1656    }
   1657}
   1658
   1659/* Call lo_dirp_put() on the return value when no longer needed */
   1660static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
   1661{
   1662    struct lo_data *lo = lo_data(req);
   1663    struct lo_map_elem *elem;
   1664
   1665    pthread_mutex_lock(&lo->mutex);
   1666    elem = lo_map_get(&lo->dirp_map, fi->fh);
   1667    if (elem) {
   1668        g_atomic_int_inc(&elem->dirp->refcount);
   1669    }
   1670    pthread_mutex_unlock(&lo->mutex);
   1671    if (!elem) {
   1672        return NULL;
   1673    }
   1674
   1675    return elem->dirp;
   1676}
   1677
   1678static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
   1679                       struct fuse_file_info *fi)
   1680{
   1681    int error = ENOMEM;
   1682    struct lo_data *lo = lo_data(req);
   1683    struct lo_dirp *d;
   1684    int fd;
   1685    ssize_t fh;
   1686
   1687    d = calloc(1, sizeof(struct lo_dirp));
   1688    if (d == NULL) {
   1689        goto out_err;
   1690    }
   1691
   1692    fd = openat(lo_fd(req, ino), ".", O_RDONLY);
   1693    if (fd == -1) {
   1694        goto out_errno;
   1695    }
   1696
   1697    d->dp = fdopendir(fd);
   1698    if (d->dp == NULL) {
   1699        goto out_errno;
   1700    }
   1701
   1702    d->offset = 0;
   1703    d->entry = NULL;
   1704
   1705    g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
   1706    pthread_mutex_lock(&lo->mutex);
   1707    fh = lo_add_dirp_mapping(req, d);
   1708    pthread_mutex_unlock(&lo->mutex);
   1709    if (fh == -1) {
   1710        goto out_err;
   1711    }
   1712
   1713    fi->fh = fh;
   1714    if (lo->cache == CACHE_ALWAYS) {
   1715        fi->cache_readdir = 1;
   1716    }
   1717    fuse_reply_open(req, fi);
   1718    return;
   1719
   1720out_errno:
   1721    error = errno;
   1722out_err:
   1723    if (d) {
   1724        if (d->dp) {
   1725            closedir(d->dp);
   1726        } else if (fd != -1) {
   1727            close(fd);
   1728        }
   1729        free(d);
   1730    }
   1731    fuse_reply_err(req, error);
   1732}
   1733
/*
 * Common implementation of readdir and readdirplus.
 *
 * Fills a buffer of at most @size bytes with directory entries of the open
 * directory handle in @fi, starting at @offset, and sends it as the reply.
 * When @plus is non-zero each entry other than "." and ".." is also looked
 * up via lo_do_lookup(), which takes a lookup reference on its inode.
 *
 * An error is only reported if no entry has been stored yet; otherwise the
 * entries collected so far are returned.
 */
static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
                          off_t offset, struct fuse_file_info *fi, int plus)
{
    struct lo_data *lo = lo_data(req);
    struct lo_dirp *d = NULL;
    struct lo_inode *dinode;
    g_autofree char *buf = NULL;
    char *p;
    size_t rem = size; /* bytes still free in buf */
    int err = EBADF;

    dinode = lo_inode(req, ino);
    if (!dinode) {
        goto error;
    }

    d = lo_dirp(req, fi);
    if (!d) {
        goto error;
    }

    /*
     * NOTE(review): if size is 0 the allocation presumably yields NULL and
     * ENOMEM is reported - confirm callers never pass 0.
     */
    err = ENOMEM;
    buf = g_try_malloc0(size);
    if (!buf) {
        goto error;
    }
    p = buf;

    /* Reposition the stream if the caller is not continuing where we are */
    if (offset != d->offset) {
        seekdir(d->dp, offset);
        d->entry = NULL;
        d->offset = offset;
    }
    while (1) {
        size_t entsize;
        off_t nextoff;
        const char *name;

        /* Reuse an entry left over from a previous call, else read one */
        if (!d->entry) {
            /* readdir() returns NULL for both; errno tells them apart */
            errno = 0;
            d->entry = readdir(d->dp);
            if (!d->entry) {
                if (errno) { /* Error */
                    err = errno;
                    goto error;
                } else { /* End of stream */
                    break;
                }
            }
        }
        nextoff = d->entry->d_off;
        name = d->entry->d_name;

        /* Non-zero only if a lookup reference was taken for this entry */
        fuse_ino_t entry_ino = 0;
        /* d_type is shifted by 12 to form the st_mode value */
        struct fuse_entry_param e = (struct fuse_entry_param){
            .attr.st_ino = d->entry->d_ino,
            .attr.st_mode = d->entry->d_type << 12,
        };

        /* Hide root's parent directory */
        if (dinode == &lo->root && strcmp(name, "..") == 0) {
            e.attr.st_ino = lo->root.key.ino;
            e.attr.st_mode = DT_DIR << 12;
        }

        if (plus) {
            if (!is_dot_or_dotdot(name)) {
                err = lo_do_lookup(req, ino, name, &e, NULL);
                if (err) {
                    goto error;
                }
                entry_ino = e.ino;
            }

            entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
        } else {
            entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
        }
        /* Entry did not fit: undo its lookup and keep it for the next call */
        if (entsize > rem) {
            if (entry_ino != 0) {
                lo_forget_one(req, entry_ino, 1);
            }
            break;
        }

        p += entsize;
        rem -= entsize;

        /* Entry consumed */
        d->entry = NULL;
        d->offset = nextoff;
    }

    err = 0;
error:
    lo_dirp_put(&d);
    lo_inode_put(lo, &dinode);

    /*
     * If there's an error, we can only signal it if we haven't stored
     * any entries yet - otherwise we'd end up with wrong lookup
     * counts for the entries that are already in the buffer. So we
     * return what we've collected until that point.
     */
    if (err && rem == size) {
        fuse_reply_err(req, err);
    } else {
        fuse_reply_buf(req, buf, size - rem);
    }
}
   1843
   1844static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
   1845                       off_t offset, struct fuse_file_info *fi)
   1846{
   1847    lo_do_readdir(req, ino, size, offset, fi, 0);
   1848}
   1849
   1850static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
   1851                           off_t offset, struct fuse_file_info *fi)
   1852{
   1853    lo_do_readdir(req, ino, size, offset, fi, 1);
   1854}
   1855
   1856static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
   1857                          struct fuse_file_info *fi)
   1858{
   1859    struct lo_data *lo = lo_data(req);
   1860    struct lo_map_elem *elem;
   1861    struct lo_dirp *d;
   1862
   1863    (void)ino;
   1864
   1865    pthread_mutex_lock(&lo->mutex);
   1866    elem = lo_map_get(&lo->dirp_map, fi->fh);
   1867    if (!elem) {
   1868        pthread_mutex_unlock(&lo->mutex);
   1869        fuse_reply_err(req, EBADF);
   1870        return;
   1871    }
   1872
   1873    d = elem->dirp;
   1874    lo_map_remove(&lo->dirp_map, fi->fh);
   1875    pthread_mutex_unlock(&lo->mutex);
   1876
   1877    lo_dirp_put(&d); /* paired with lo_opendir() */
   1878
   1879    fuse_reply_err(req, 0);
   1880}
   1881
   1882static void update_open_flags(int writeback, int allow_direct_io,
   1883                              struct fuse_file_info *fi)
   1884{
   1885    /*
   1886     * With writeback cache, kernel may send read requests even
   1887     * when userspace opened write-only
   1888     */
   1889    if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
   1890        fi->flags &= ~O_ACCMODE;
   1891        fi->flags |= O_RDWR;
   1892    }
   1893
   1894    /*
   1895     * With writeback cache, O_APPEND is handled by the kernel.
   1896     * This breaks atomicity (since the file may change in the
   1897     * underlying filesystem, so that the kernel's idea of the
   1898     * end of the file isn't accurate anymore). In this example,
   1899     * we just accept that. A more rigorous filesystem may want
   1900     * to return an error here
   1901     */
   1902    if (writeback && (fi->flags & O_APPEND)) {
   1903        fi->flags &= ~O_APPEND;
   1904    }
   1905
   1906    /*
   1907     * O_DIRECT in guest should not necessarily mean bypassing page
   1908     * cache on host as well. Therefore, we discard it by default
   1909     * ('-o no_allow_direct_io'). If somebody needs that behavior,
   1910     * the '-o allow_direct_io' option should be set.
   1911     */
   1912    if (!allow_direct_io) {
   1913        fi->flags &= ~O_DIRECT;
   1914    }
   1915}
   1916
   1917/*
   1918 * Open a regular file, set up an fd mapping, and fill out the struct
   1919 * fuse_file_info for it. If existing_fd is not negative, use that fd instead
   1920 * opening a new one. Takes ownership of existing_fd.
   1921 *
   1922 * Returns 0 on success or a positive errno.
   1923 */
   1924static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
   1925                      int existing_fd, struct fuse_file_info *fi)
   1926{
   1927    ssize_t fh;
   1928    int fd = existing_fd;
   1929    int err;
   1930    bool cap_fsetid_dropped = false;
   1931    bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
   1932
   1933    update_open_flags(lo->writeback, lo->allow_direct_io, fi);
   1934
   1935    if (fd < 0) {
   1936        if (kill_suidgid) {
   1937            err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
   1938            if (err) {
   1939                return err;
   1940            }
   1941        }
   1942
   1943        fd = lo_inode_open(lo, inode, fi->flags);
   1944
   1945        if (cap_fsetid_dropped) {
   1946            if (gain_effective_cap("FSETID")) {
   1947                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
   1948            }
   1949        }
   1950        if (fd < 0) {
   1951            return -fd;
   1952        }
   1953        if (fi->flags & (O_TRUNC)) {
   1954            int err = drop_security_capability(lo, fd);
   1955            if (err) {
   1956                close(fd);
   1957                return err;
   1958            }
   1959        }
   1960    }
   1961
   1962    pthread_mutex_lock(&lo->mutex);
   1963    fh = lo_add_fd_mapping(lo, fd);
   1964    pthread_mutex_unlock(&lo->mutex);
   1965    if (fh == -1) {
   1966        close(fd);
   1967        return ENOMEM;
   1968    }
   1969
   1970    fi->fh = fh;
   1971    if (lo->cache == CACHE_NONE) {
   1972        fi->direct_io = 1;
   1973    } else if (lo->cache == CACHE_ALWAYS) {
   1974        fi->keep_cache = 1;
   1975    }
   1976    return 0;
   1977}
   1978
   1979static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
   1980                      mode_t mode, struct fuse_file_info *fi)
   1981{
   1982    int fd = -1;
   1983    struct lo_data *lo = lo_data(req);
   1984    struct lo_inode *parent_inode;
   1985    struct lo_inode *inode = NULL;
   1986    struct fuse_entry_param e;
   1987    int err;
   1988    struct lo_cred old = {};
   1989
   1990    fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
   1991             " kill_priv=%d\n", parent, name, fi->kill_priv);
   1992
   1993    if (!is_safe_path_component(name)) {
   1994        fuse_reply_err(req, EINVAL);
   1995        return;
   1996    }
   1997
   1998    parent_inode = lo_inode(req, parent);
   1999    if (!parent_inode) {
   2000        fuse_reply_err(req, EBADF);
   2001        return;
   2002    }
   2003
   2004    err = lo_change_cred(req, &old, lo->change_umask);
   2005    if (err) {
   2006        goto out;
   2007    }
   2008
   2009    update_open_flags(lo->writeback, lo->allow_direct_io, fi);
   2010
   2011    /* Try to create a new file but don't open existing files */
   2012    fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode);
   2013    err = fd == -1 ? errno : 0;
   2014
   2015    lo_restore_cred(&old, lo->change_umask);
   2016
   2017    /* Ignore the error if file exists and O_EXCL was not given */
   2018    if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
   2019        goto out;
   2020    }
   2021
   2022    err = lo_do_lookup(req, parent, name, &e, &inode);
   2023    if (err) {
   2024        goto out;
   2025    }
   2026
   2027    err = lo_do_open(lo, inode, fd, fi);
   2028    fd = -1; /* lo_do_open() takes ownership of fd */
   2029    if (err) {
   2030        /* Undo lo_do_lookup() nlookup ref */
   2031        unref_inode_lolocked(lo, inode, 1);
   2032    }
   2033
   2034out:
   2035    lo_inode_put(lo, &inode);
   2036    lo_inode_put(lo, &parent_inode);
   2037
   2038    if (err) {
   2039        if (fd >= 0) {
   2040            close(fd);
   2041        }
   2042
   2043        fuse_reply_err(req, err);
   2044    } else {
   2045        fuse_reply_create(req, &e, fi);
   2046    }
   2047}
   2048
   2049/* Should be called with inode->plock_mutex held */
   2050static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
   2051                                                      struct lo_inode *inode,
   2052                                                      uint64_t lock_owner,
   2053                                                      pid_t pid, int *err)
   2054{
   2055    struct lo_inode_plock *plock;
   2056    int fd;
   2057
   2058    plock =
   2059        g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
   2060
   2061    if (plock) {
   2062        return plock;
   2063    }
   2064
   2065    plock = malloc(sizeof(struct lo_inode_plock));
   2066    if (!plock) {
   2067        *err = ENOMEM;
   2068        return NULL;
   2069    }
   2070
   2071    /* Open another instance of file which can be used for ofd locks. */
   2072    /* TODO: What if file is not writable? */
   2073    fd = lo_inode_open(lo, inode, O_RDWR);
   2074    if (fd < 0) {
   2075        *err = -fd;
   2076        free(plock);
   2077        return NULL;
   2078    }
   2079
   2080    plock->lock_owner = lock_owner;
   2081    plock->fd = fd;
   2082    g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
   2083                        plock);
   2084    return plock;
   2085}
   2086
   2087static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
   2088                     struct flock *lock)
   2089{
   2090    struct lo_data *lo = lo_data(req);
   2091    struct lo_inode *inode;
   2092    struct lo_inode_plock *plock;
   2093    int ret, saverr = 0;
   2094
   2095    fuse_log(FUSE_LOG_DEBUG,
   2096             "lo_getlk(ino=%" PRIu64 ", flags=%d)"
   2097             " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64
   2098             " l_len=0x%" PRIx64 "\n",
   2099             ino, fi->flags, fi->lock_owner, lock->l_type,
   2100             (uint64_t)lock->l_start, (uint64_t)lock->l_len);
   2101
   2102    if (!lo->posix_lock) {
   2103        fuse_reply_err(req, ENOSYS);
   2104        return;
   2105    }
   2106
   2107    inode = lo_inode(req, ino);
   2108    if (!inode) {
   2109        fuse_reply_err(req, EBADF);
   2110        return;
   2111    }
   2112
   2113    pthread_mutex_lock(&inode->plock_mutex);
   2114    plock =
   2115        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
   2116    if (!plock) {
   2117        saverr = ret;
   2118        goto out;
   2119    }
   2120
   2121    ret = fcntl(plock->fd, F_OFD_GETLK, lock);
   2122    if (ret == -1) {
   2123        saverr = errno;
   2124    }
   2125
   2126out:
   2127    pthread_mutex_unlock(&inode->plock_mutex);
   2128    lo_inode_put(lo, &inode);
   2129
   2130    if (saverr) {
   2131        fuse_reply_err(req, saverr);
   2132    } else {
   2133        fuse_reply_lock(req, lock);
   2134    }
   2135}
   2136
   2137static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
   2138                     struct flock *lock, int sleep)
   2139{
   2140    struct lo_data *lo = lo_data(req);
   2141    struct lo_inode *inode;
   2142    struct lo_inode_plock *plock;
   2143    int ret, saverr = 0;
   2144
   2145    fuse_log(FUSE_LOG_DEBUG,
   2146             "lo_setlk(ino=%" PRIu64 ", flags=%d)"
   2147             " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d"
   2148             " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n",
   2149             ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
   2150             lock->l_whence, (uint64_t)lock->l_start, (uint64_t)lock->l_len);
   2151
   2152    if (!lo->posix_lock) {
   2153        fuse_reply_err(req, ENOSYS);
   2154        return;
   2155    }
   2156
   2157    if (sleep) {
   2158        fuse_reply_err(req, EOPNOTSUPP);
   2159        return;
   2160    }
   2161
   2162    inode = lo_inode(req, ino);
   2163    if (!inode) {
   2164        fuse_reply_err(req, EBADF);
   2165        return;
   2166    }
   2167
   2168    pthread_mutex_lock(&inode->plock_mutex);
   2169    plock =
   2170        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
   2171
   2172    if (!plock) {
   2173        saverr = ret;
   2174        goto out;
   2175    }
   2176
   2177    /* TODO: Is it alright to modify flock? */
   2178    lock->l_pid = 0;
   2179    ret = fcntl(plock->fd, F_OFD_SETLK, lock);
   2180    if (ret == -1) {
   2181        saverr = errno;
   2182    }
   2183
   2184out:
   2185    pthread_mutex_unlock(&inode->plock_mutex);
   2186    lo_inode_put(lo, &inode);
   2187
   2188    fuse_reply_err(req, saverr);
   2189}
   2190
   2191static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
   2192                        struct fuse_file_info *fi)
   2193{
   2194    int res;
   2195    struct lo_dirp *d;
   2196    int fd;
   2197
   2198    (void)ino;
   2199
   2200    d = lo_dirp(req, fi);
   2201    if (!d) {
   2202        fuse_reply_err(req, EBADF);
   2203        return;
   2204    }
   2205
   2206    fd = dirfd(d->dp);
   2207    if (datasync) {
   2208        res = fdatasync(fd);
   2209    } else {
   2210        res = fsync(fd);
   2211    }
   2212
   2213    lo_dirp_put(&d);
   2214
   2215    fuse_reply_err(req, res == -1 ? errno : 0);
   2216}
   2217
   2218static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
   2219{
   2220    struct lo_data *lo = lo_data(req);
   2221    struct lo_inode *inode = lo_inode(req, ino);
   2222    int err;
   2223
   2224    fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
   2225             "\n", ino, fi->flags, fi->kill_priv);
   2226
   2227    if (!inode) {
   2228        fuse_reply_err(req, EBADF);
   2229        return;
   2230    }
   2231
   2232    err = lo_do_open(lo, inode, -1, fi);
   2233    lo_inode_put(lo, &inode);
   2234    if (err) {
   2235        fuse_reply_err(req, err);
   2236    } else {
   2237        fuse_reply_open(req, fi);
   2238    }
   2239}
   2240
   2241static void lo_release(fuse_req_t req, fuse_ino_t ino,
   2242                       struct fuse_file_info *fi)
   2243{
   2244    struct lo_data *lo = lo_data(req);
   2245    struct lo_map_elem *elem;
   2246    int fd = -1;
   2247
   2248    (void)ino;
   2249
   2250    pthread_mutex_lock(&lo->mutex);
   2251    elem = lo_map_get(&lo->fd_map, fi->fh);
   2252    if (elem) {
   2253        fd = elem->fd;
   2254        elem = NULL;
   2255        lo_map_remove(&lo->fd_map, fi->fh);
   2256    }
   2257    pthread_mutex_unlock(&lo->mutex);
   2258
   2259    close(fd);
   2260    fuse_reply_err(req, 0);
   2261}
   2262
   2263static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
   2264{
   2265    int res;
   2266    (void)ino;
   2267    struct lo_inode *inode;
   2268    struct lo_data *lo = lo_data(req);
   2269
   2270    inode = lo_inode(req, ino);
   2271    if (!inode) {
   2272        fuse_reply_err(req, EBADF);
   2273        return;
   2274    }
   2275
   2276    if (!S_ISREG(inode->filetype)) {
   2277        lo_inode_put(lo, &inode);
   2278        fuse_reply_err(req, EBADF);
   2279        return;
   2280    }
   2281
   2282    /* An fd is going away. Cleanup associated posix locks */
   2283    if (lo->posix_lock) {
   2284        pthread_mutex_lock(&inode->plock_mutex);
   2285        g_hash_table_remove(inode->posix_locks,
   2286            GUINT_TO_POINTER(fi->lock_owner));
   2287        pthread_mutex_unlock(&inode->plock_mutex);
   2288    }
   2289    res = close(dup(lo_fi_fd(req, fi)));
   2290    lo_inode_put(lo, &inode);
   2291    fuse_reply_err(req, res == -1 ? errno : 0);
   2292}
   2293
   2294static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
   2295                     struct fuse_file_info *fi)
   2296{
   2297    struct lo_inode *inode = lo_inode(req, ino);
   2298    struct lo_data *lo = lo_data(req);
   2299    int res;
   2300    int fd;
   2301
   2302    fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
   2303             (void *)fi);
   2304
   2305    if (!inode) {
   2306        fuse_reply_err(req, EBADF);
   2307        return;
   2308    }
   2309
   2310    if (!fi) {
   2311        fd = lo_inode_open(lo, inode, O_RDWR);
   2312        if (fd < 0) {
   2313            res = -fd;
   2314            goto out;
   2315        }
   2316    } else {
   2317        fd = lo_fi_fd(req, fi);
   2318    }
   2319
   2320    if (datasync) {
   2321        res = fdatasync(fd) == -1 ? errno : 0;
   2322    } else {
   2323        res = fsync(fd) == -1 ? errno : 0;
   2324    }
   2325    if (!fi) {
   2326        close(fd);
   2327    }
   2328out:
   2329    lo_inode_put(lo, &inode);
   2330    fuse_reply_err(req, res);
   2331}
   2332
   2333static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
   2334                    struct fuse_file_info *fi)
   2335{
   2336    struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
   2337
   2338    fuse_log(FUSE_LOG_DEBUG,
   2339             "lo_read(ino=%" PRIu64 ", size=%zd, "
   2340             "off=%lu)\n",
   2341             ino, size, (unsigned long)offset);
   2342
   2343    buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
   2344    buf.buf[0].fd = lo_fi_fd(req, fi);
   2345    buf.buf[0].pos = offset;
   2346
   2347    fuse_reply_data(req, &buf);
   2348}
   2349
   2350static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
   2351                         struct fuse_bufvec *in_buf, off_t off,
   2352                         struct fuse_file_info *fi)
   2353{
   2354    (void)ino;
   2355    ssize_t res;
   2356    struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
   2357    bool cap_fsetid_dropped = false;
   2358
   2359    out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
   2360    out_buf.buf[0].fd = lo_fi_fd(req, fi);
   2361    out_buf.buf[0].pos = off;
   2362
   2363    fuse_log(FUSE_LOG_DEBUG,
   2364             "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
   2365             ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
   2366
   2367    res = drop_security_capability(lo_data(req), out_buf.buf[0].fd);
   2368    if (res) {
   2369        fuse_reply_err(req, res);
   2370        return;
   2371    }
   2372
   2373    /*
   2374     * If kill_priv is set, drop CAP_FSETID which should lead to kernel
   2375     * clearing setuid/setgid on file. Note, for WRITE, we need to do
   2376     * this even if killpriv_v2 is not enabled. fuse direct write path
   2377     * relies on this.
   2378     */
   2379    if (fi->kill_priv) {
   2380        res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
   2381        if (res != 0) {
   2382            fuse_reply_err(req, res);
   2383            return;
   2384        }
   2385    }
   2386
   2387    res = fuse_buf_copy(&out_buf, in_buf);
   2388    if (res < 0) {
   2389        fuse_reply_err(req, -res);
   2390    } else {
   2391        fuse_reply_write(req, (size_t)res);
   2392    }
   2393
   2394    if (cap_fsetid_dropped) {
   2395        res = gain_effective_cap("FSETID");
   2396        if (res) {
   2397            fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
   2398        }
   2399    }
   2400}
   2401
   2402static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
   2403{
   2404    int res;
   2405    struct statvfs stbuf;
   2406
   2407    res = fstatvfs(lo_fd(req, ino), &stbuf);
   2408    if (res == -1) {
   2409        fuse_reply_err(req, errno);
   2410    } else {
   2411        fuse_reply_statfs(req, &stbuf);
   2412    }
   2413}
   2414
   2415static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
   2416                         off_t length, struct fuse_file_info *fi)
   2417{
   2418    int err = EOPNOTSUPP;
   2419    (void)ino;
   2420
   2421#ifdef CONFIG_FALLOCATE
   2422    err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
   2423    if (err < 0) {
   2424        err = errno;
   2425    }
   2426
   2427#elif defined(CONFIG_POSIX_FALLOCATE)
   2428    if (mode) {
   2429        fuse_reply_err(req, EOPNOTSUPP);
   2430        return;
   2431    }
   2432
   2433    err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
   2434#endif
   2435
   2436    fuse_reply_err(req, err);
   2437}
   2438
   2439static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
   2440                     int op)
   2441{
   2442    int res;
   2443    (void)ino;
   2444
   2445    res = flock(lo_fi_fd(req, fi), op);
   2446
   2447    fuse_reply_err(req, res == -1 ? errno : 0);
   2448}
   2449
/*
 * Flag bits stored in XattrMapEntry.flags.
 *
 * Rule types (what to do with an attribute name that matches a rule):
 */
/*
 * Stop processing; the attribute name is passed through unmodified
 * if matched. An empty key applies to all.
 */
#define XATTR_MAP_FLAG_OK      (1 <<  0)
/*
 * The attribute is unwanted;
 * EPERM on write, hidden on read.
 */
#define XATTR_MAP_FLAG_BAD     (1 <<  1)
/*
 * For attributes that start with 'key', prepend 'prepend'.
 * 'key' may be empty to prepend for all attributes.
 * 'key' is defined from the set/remove point of view and is
 * automatically reversed on read.
 */
#define XATTR_MAP_FLAG_PREFIX  (1 <<  2)

/* Rule scopes (which operations a rule applies to): */
/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT  (1 << 16)
/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER  (1 << 17)
/* Apply rule to all of the above */
#define XATTR_MAP_FLAG_ALL   (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
   2476
   2477static void add_xattrmap_entry(struct lo_data *lo,
   2478                               const XattrMapEntry *new_entry)
   2479{
   2480    XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
   2481                                     lo->xattr_map_nentries + 1,
   2482                                     sizeof(XattrMapEntry));
   2483    res[lo->xattr_map_nentries++] = *new_entry;
   2484
   2485    lo->xattr_map_list = res;
   2486}
   2487
   2488static void free_xattrmap(struct lo_data *lo)
   2489{
   2490    XattrMapEntry *map = lo->xattr_map_list;
   2491    size_t i;
   2492
   2493    if (!map) {
   2494        return;
   2495    }
   2496
   2497    for (i = 0; i < lo->xattr_map_nentries; i++) {
   2498        g_free(map[i].key);
   2499        g_free(map[i].prepend);
   2500    };
   2501
   2502    g_free(map);
   2503    lo->xattr_map_list = NULL;
   2504    lo->xattr_map_nentries = -1;
   2505}
   2506
   2507/*
   2508 * Handle the 'map' type, which is sugar for a set of commands
   2509 * for the common case of prefixing a subset or everything,
   2510 * and allowing anything not prefixed through.
   2511 * It must be the last entry in the stream, although there
   2512 * can be other entries before it.
   2513 * The form is:
   2514 *    :map:key:prefix:
   2515 *
   2516 * key maybe empty in which case all entries are prefixed.
   2517 */
   2518static void parse_xattrmap_map(struct lo_data *lo,
   2519                               const char *rule, char sep)
   2520{
   2521    const char *tmp;
   2522    char *key;
   2523    char *prefix;
   2524    XattrMapEntry tmp_entry;
   2525
   2526    if (*rule != sep) {
   2527        fuse_log(FUSE_LOG_ERR,
   2528                 "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
   2529                 __func__, sep, *rule);
   2530        exit(1);
   2531    }
   2532
   2533    rule++;
   2534
   2535    /* At start of 'key' field */
   2536    tmp = strchr(rule, sep);
   2537    if (!tmp) {
   2538        fuse_log(FUSE_LOG_ERR,
   2539                 "%s: Missing '%c' at end of key field in map rule\n",
   2540                 __func__, sep);
   2541        exit(1);
   2542    }
   2543
   2544    key = g_strndup(rule, tmp - rule);
   2545    rule = tmp + 1;
   2546
   2547    /* At start of prefix field */
   2548    tmp = strchr(rule, sep);
   2549    if (!tmp) {
   2550        fuse_log(FUSE_LOG_ERR,
   2551                 "%s: Missing '%c' at end of prefix field in map rule\n",
   2552                 __func__, sep);
   2553        exit(1);
   2554    }
   2555
   2556    prefix = g_strndup(rule, tmp - rule);
   2557    rule = tmp + 1;
   2558
   2559    /*
   2560     * This should be the end of the string, we don't allow
   2561     * any more commands after 'map'.
   2562     */
   2563    if (*rule) {
   2564        fuse_log(FUSE_LOG_ERR,
   2565                 "%s: Expecting end of command after map, found '%c'\n",
   2566                 __func__, *rule);
   2567        exit(1);
   2568    }
   2569
   2570    /* 1st: Prefix matches/everything */
   2571    tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
   2572    tmp_entry.key = g_strdup(key);
   2573    tmp_entry.prepend = g_strdup(prefix);
   2574    add_xattrmap_entry(lo, &tmp_entry);
   2575
   2576    if (!*key) {
   2577        /* Prefix all case */
   2578
   2579        /* 2nd: Hide any non-prefixed entries on the host */
   2580        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
   2581        tmp_entry.key = g_strdup("");
   2582        tmp_entry.prepend = g_strdup("");
   2583        add_xattrmap_entry(lo, &tmp_entry);
   2584    } else {
   2585        /* Prefix matching case */
   2586
   2587        /* 2nd: Hide non-prefixed but matching entries on the host */
   2588        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
   2589        tmp_entry.key = g_strdup(""); /* Not used */
   2590        tmp_entry.prepend = g_strdup(key);
   2591        add_xattrmap_entry(lo, &tmp_entry);
   2592
   2593        /* 3rd: Stop the client accessing prefixed attributes directly */
   2594        tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
   2595        tmp_entry.key = g_strdup(prefix);
   2596        tmp_entry.prepend = g_strdup(""); /* Not used */
   2597        add_xattrmap_entry(lo, &tmp_entry);
   2598
   2599        /* 4th: Everything else is OK */
   2600        tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
   2601        tmp_entry.key = g_strdup("");
   2602        tmp_entry.prepend = g_strdup("");
   2603        add_xattrmap_entry(lo, &tmp_entry);
   2604    }
   2605
   2606    g_free(key);
   2607    g_free(prefix);
   2608}
   2609
   2610static void parse_xattrmap(struct lo_data *lo)
   2611{
   2612    const char *map = lo->xattrmap;
   2613    const char *tmp;
   2614    int ret;
   2615
   2616    lo->xattr_map_nentries = 0;
   2617    while (*map) {
   2618        XattrMapEntry tmp_entry;
   2619        char sep;
   2620
   2621        if (isspace(*map)) {
   2622            map++;
   2623            continue;
   2624        }
   2625        /* The separator is the first non-space of the rule */
   2626        sep = *map++;
   2627        if (!sep) {
   2628            break;
   2629        }
   2630
   2631        tmp_entry.flags = 0;
   2632        /* Start of 'type' */
   2633        if (strstart(map, "prefix", &map)) {
   2634            tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
   2635        } else if (strstart(map, "ok", &map)) {
   2636            tmp_entry.flags |= XATTR_MAP_FLAG_OK;
   2637        } else if (strstart(map, "bad", &map)) {
   2638            tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
   2639        } else if (strstart(map, "map", &map)) {
   2640            /*
   2641             * map is sugar that adds a number of rules, and must be
   2642             * the last entry.
   2643             */
   2644            parse_xattrmap_map(lo, map, sep);
   2645            break;
   2646        } else {
   2647            fuse_log(FUSE_LOG_ERR,
   2648                     "%s: Unexpected type;"
   2649                     "Expecting 'prefix', 'ok', 'bad' or 'map' in rule %zu\n",
   2650                     __func__, lo->xattr_map_nentries);
   2651            exit(1);
   2652        }
   2653
   2654        if (*map++ != sep) {
   2655            fuse_log(FUSE_LOG_ERR,
   2656                     "%s: Missing '%c' at end of type field of rule %zu\n",
   2657                     __func__, sep, lo->xattr_map_nentries);
   2658            exit(1);
   2659        }
   2660
   2661        /* Start of 'scope' */
   2662        if (strstart(map, "client", &map)) {
   2663            tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
   2664        } else if (strstart(map, "server", &map)) {
   2665            tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
   2666        } else if (strstart(map, "all", &map)) {
   2667            tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
   2668        } else {
   2669            fuse_log(FUSE_LOG_ERR,
   2670                     "%s: Unexpected scope;"
   2671                     " Expecting 'client', 'server', or 'all', in rule %zu\n",
   2672                     __func__, lo->xattr_map_nentries);
   2673            exit(1);
   2674        }
   2675
   2676        if (*map++ != sep) {
   2677            fuse_log(FUSE_LOG_ERR,
   2678                     "%s: Expecting '%c' found '%c'"
   2679                     " after scope in rule %zu\n",
   2680                     __func__, sep, *map, lo->xattr_map_nentries);
   2681            exit(1);
   2682        }
   2683
   2684        /* At start of 'key' field */
   2685        tmp = strchr(map, sep);
   2686        if (!tmp) {
   2687            fuse_log(FUSE_LOG_ERR,
   2688                     "%s: Missing '%c' at end of key field of rule %zu",
   2689                     __func__, sep, lo->xattr_map_nentries);
   2690            exit(1);
   2691        }
   2692        tmp_entry.key = g_strndup(map, tmp - map);
   2693        map = tmp + 1;
   2694
   2695        /* At start of 'prepend' field */
   2696        tmp = strchr(map, sep);
   2697        if (!tmp) {
   2698            fuse_log(FUSE_LOG_ERR,
   2699                     "%s: Missing '%c' at end of prepend field of rule %zu",
   2700                     __func__, sep, lo->xattr_map_nentries);
   2701            exit(1);
   2702        }
   2703        tmp_entry.prepend = g_strndup(map, tmp - map);
   2704        map = tmp + 1;
   2705
   2706        add_xattrmap_entry(lo, &tmp_entry);
   2707        /* End of rule - go around again for another rule */
   2708    }
   2709
   2710    if (!lo->xattr_map_nentries) {
   2711        fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
   2712        exit(1);
   2713    }
   2714
   2715    ret = xattr_map_client(lo, "security.capability",
   2716                           &lo->xattr_security_capability);
   2717    if (ret) {
   2718        fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n",
   2719                strerror(ret));
   2720        exit(1);
   2721    }
   2722    if (!lo->xattr_security_capability ||
   2723        !strcmp(lo->xattr_security_capability, "security.capability")) {
   2724        /* 1-1 mapping, don't need to do anything */
   2725        free(lo->xattr_security_capability);
   2726        lo->xattr_security_capability = NULL;
   2727    }
   2728}
   2729
   2730/*
   2731 * For use with getxattr/setxattr/removexattr, where the client
   2732 * gives us a name and we may need to choose a different one.
   2733 * Allocates a buffer for the result placing it in *out_name.
   2734 *   If there's no change then *out_name is not set.
   2735 * Returns 0 on success
   2736 * Can return -EPERM to indicate we block a given attribute
   2737 *   (in which case out_name is not allocated)
   2738 * Can return -ENOMEM to indicate out_name couldn't be allocated.
   2739 */
   2740static int xattr_map_client(const struct lo_data *lo, const char *client_name,
   2741                            char **out_name)
   2742{
   2743    size_t i;
   2744    for (i = 0; i < lo->xattr_map_nentries; i++) {
   2745        const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
   2746
   2747        if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
   2748            (strstart(client_name, cur_entry->key, NULL))) {
   2749            if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
   2750                return -EPERM;
   2751            }
   2752            if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
   2753                /* Unmodified name */
   2754                return 0;
   2755            }
   2756            if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
   2757                *out_name = g_try_malloc(strlen(client_name) +
   2758                                         strlen(cur_entry->prepend) + 1);
   2759                if (!*out_name) {
   2760                    return -ENOMEM;
   2761                }
   2762                sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
   2763                return 0;
   2764            }
   2765        }
   2766    }
   2767
   2768    return -EPERM;
   2769}
   2770
   2771/*
   2772 * For use with listxattr where the server fs gives us a name and we may need
   2773 * to sanitize this for the client.
   2774 * Returns a pointer to the result in *out_name
   2775 *   This is always the original string or the current string with some prefix
   2776 *   removed; no reallocation is done.
   2777 * Returns 0 on success
   2778 * Can return -ENODATA to indicate the name should be dropped from the list.
   2779 */
   2780static int xattr_map_server(const struct lo_data *lo, const char *server_name,
   2781                            const char **out_name)
   2782{
   2783    size_t i;
   2784    const char *end;
   2785
   2786    for (i = 0; i < lo->xattr_map_nentries; i++) {
   2787        const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
   2788
   2789        if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
   2790            (strstart(server_name, cur_entry->prepend, &end))) {
   2791            if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
   2792                return -ENODATA;
   2793            }
   2794            if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
   2795                *out_name = server_name;
   2796                return 0;
   2797            }
   2798            if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
   2799                /* Remove prefix */
   2800                *out_name = end;
   2801                return 0;
   2802            }
   2803        }
   2804    }
   2805
   2806    return -ENODATA;
   2807}
   2808
/*
 * Change the current working directory to the one referred to by fd and
 * assert that this succeeded. The result is stored in a local first so
 * that the fchdir() call itself is not part of the assert() expression.
 */
#define FCHDIR_NOFAIL(fd) do {                         \
        int fchdir_res = fchdir(fd);                   \
        assert(fchdir_res == 0);                       \
    } while (0)
   2813
   2814static bool block_xattr(struct lo_data *lo, const char *name)
   2815{
   2816    /*
   2817     * If user explicitly enabled posix_acl or did not provide any option,
   2818     * do not block acl. Otherwise block system.posix_acl_access and
   2819     * system.posix_acl_default xattrs.
   2820     */
   2821    if (lo->user_posix_acl) {
   2822        return false;
   2823    }
   2824    if (!strcmp(name, "system.posix_acl_access") ||
   2825        !strcmp(name, "system.posix_acl_default"))
   2826            return true;
   2827
   2828    return false;
   2829}
   2830
   2831/*
   2832 * Returns number of bytes in xattr_list after filtering on success. This
   2833 * could be zero as well if nothing is left after filtering.
   2834 *
   2835 * Returns negative error code on failure.
   2836 * xattr_list is modified in place.
   2837 */
   2838static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list,
   2839                                 unsigned in_size)
   2840{
   2841    size_t out_index, in_index;
   2842
   2843    /*
   2844     * As of now we only filter out acl xattrs. If acls are enabled or
   2845     * they have not been explicitly disabled, there is nothing to
   2846     * filter.
   2847     */
   2848    if (lo->user_posix_acl) {
   2849        return in_size;
   2850    }
   2851
   2852    out_index = 0;
   2853    in_index = 0;
   2854    while (in_index < in_size) {
   2855        char *in_ptr = xattr_list + in_index;
   2856
   2857        /* Length of current attribute name */
   2858        size_t in_len = strlen(xattr_list + in_index) + 1;
   2859
   2860        if (!block_xattr(lo, in_ptr)) {
   2861            if (in_index != out_index) {
   2862                memmove(xattr_list + out_index, xattr_list + in_index, in_len);
   2863            }
   2864            out_index += in_len;
   2865        }
   2866        in_index += in_len;
   2867     }
   2868    return out_index;
   2869}
   2870
   2871static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
   2872                        size_t size)
   2873{
   2874    struct lo_data *lo = lo_data(req);
   2875    g_autofree char *value = NULL;
   2876    char procname[64];
   2877    const char *name;
   2878    char *mapped_name;
   2879    struct lo_inode *inode;
   2880    ssize_t ret;
   2881    int saverr;
   2882    int fd = -1;
   2883
   2884    if (block_xattr(lo, in_name)) {
   2885        fuse_reply_err(req, EOPNOTSUPP);
   2886        return;
   2887    }
   2888
   2889    mapped_name = NULL;
   2890    name = in_name;
   2891    if (lo->xattrmap) {
   2892        ret = xattr_map_client(lo, in_name, &mapped_name);
   2893        if (ret < 0) {
   2894            if (ret == -EPERM) {
   2895                ret = -ENODATA;
   2896            }
   2897            fuse_reply_err(req, -ret);
   2898            return;
   2899        }
   2900        if (mapped_name) {
   2901            name = mapped_name;
   2902        }
   2903    }
   2904
   2905    inode = lo_inode(req, ino);
   2906    if (!inode) {
   2907        fuse_reply_err(req, EBADF);
   2908        g_free(mapped_name);
   2909        return;
   2910    }
   2911
   2912    saverr = ENOSYS;
   2913    if (!lo_data(req)->xattr) {
   2914        goto out;
   2915    }
   2916
   2917    fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
   2918             ino, name, size);
   2919
   2920    if (size) {
   2921        value = g_try_malloc(size);
   2922        if (!value) {
   2923            goto out_err;
   2924        }
   2925    }
   2926
   2927    sprintf(procname, "%i", inode->fd);
   2928    /*
   2929     * It is not safe to open() non-regular/non-dir files in file server
   2930     * unless O_PATH is used, so use that method for regular files/dir
   2931     * only (as it seems giving less performance overhead).
   2932     * Otherwise, call fchdir() to avoid open().
   2933     */
   2934    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   2935        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   2936        if (fd < 0) {
   2937            goto out_err;
   2938        }
   2939        ret = fgetxattr(fd, name, value, size);
   2940        saverr = ret == -1 ? errno : 0;
   2941    } else {
   2942        /* fchdir should not fail here */
   2943        FCHDIR_NOFAIL(lo->proc_self_fd);
   2944        ret = getxattr(procname, name, value, size);
   2945        saverr = ret == -1 ? errno : 0;
   2946        FCHDIR_NOFAIL(lo->root.fd);
   2947    }
   2948
   2949    if (ret == -1) {
   2950        goto out;
   2951    }
   2952    if (size) {
   2953        saverr = 0;
   2954        if (ret == 0) {
   2955            goto out;
   2956        }
   2957        fuse_reply_buf(req, value, ret);
   2958    } else {
   2959        fuse_reply_xattr(req, ret);
   2960    }
   2961out_free:
   2962    if (fd >= 0) {
   2963        close(fd);
   2964    }
   2965
   2966    lo_inode_put(lo, &inode);
   2967    return;
   2968
   2969out_err:
   2970    saverr = errno;
   2971out:
   2972    fuse_reply_err(req, saverr);
   2973    g_free(mapped_name);
   2974    goto out_free;
   2975}
   2976
   2977static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
   2978{
   2979    struct lo_data *lo = lo_data(req);
   2980    g_autofree char *value = NULL;
   2981    char procname[64];
   2982    struct lo_inode *inode;
   2983    ssize_t ret;
   2984    int saverr;
   2985    int fd = -1;
   2986
   2987    inode = lo_inode(req, ino);
   2988    if (!inode) {
   2989        fuse_reply_err(req, EBADF);
   2990        return;
   2991    }
   2992
   2993    saverr = ENOSYS;
   2994    if (!lo_data(req)->xattr) {
   2995        goto out;
   2996    }
   2997
   2998    fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
   2999             size);
   3000
   3001    if (size) {
   3002        value = g_try_malloc(size);
   3003        if (!value) {
   3004            goto out_err;
   3005        }
   3006    }
   3007
   3008    sprintf(procname, "%i", inode->fd);
   3009    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   3010        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   3011        if (fd < 0) {
   3012            goto out_err;
   3013        }
   3014        ret = flistxattr(fd, value, size);
   3015        saverr = ret == -1 ? errno : 0;
   3016    } else {
   3017        /* fchdir should not fail here */
   3018        FCHDIR_NOFAIL(lo->proc_self_fd);
   3019        ret = listxattr(procname, value, size);
   3020        saverr = ret == -1 ? errno : 0;
   3021        FCHDIR_NOFAIL(lo->root.fd);
   3022    }
   3023
   3024    if (ret == -1) {
   3025        goto out;
   3026    }
   3027    if (size) {
   3028        saverr = 0;
   3029        if (ret == 0) {
   3030            goto out;
   3031        }
   3032
   3033        if (lo->xattr_map_list) {
   3034            /*
   3035             * Map the names back, some attributes might be dropped,
   3036             * some shortened, but not increased, so we shouldn't
   3037             * run out of room.
   3038             */
   3039            size_t out_index, in_index;
   3040            out_index = 0;
   3041            in_index = 0;
   3042            while (in_index < ret) {
   3043                const char *map_out;
   3044                char *in_ptr = value + in_index;
   3045                /* Length of current attribute name */
   3046                size_t in_len = strlen(value + in_index) + 1;
   3047
   3048                int mapret = xattr_map_server(lo, in_ptr, &map_out);
   3049                if (mapret != -ENODATA && mapret != 0) {
   3050                    /* Shouldn't happen */
   3051                    saverr = -mapret;
   3052                    goto out;
   3053                }
   3054                if (mapret == 0) {
   3055                    /* Either unchanged, or truncated */
   3056                    size_t out_len;
   3057                    if (map_out != in_ptr) {
   3058                        /* +1 copies the NIL */
   3059                        out_len = strlen(map_out) + 1;
   3060                    } else {
   3061                        /* No change */
   3062                        out_len = in_len;
   3063                    }
   3064                    /*
   3065                     * Move result along, may still be needed for an unchanged
   3066                     * entry if a previous entry was changed.
   3067                     */
   3068                    memmove(value + out_index, map_out, out_len);
   3069
   3070                    out_index += out_len;
   3071                }
   3072                in_index += in_len;
   3073            }
   3074            ret = out_index;
   3075            if (ret == 0) {
   3076                goto out;
   3077            }
   3078        }
   3079
   3080        ret = remove_blocked_xattrs(lo, value, ret);
   3081        if (ret <= 0) {
   3082            saverr = -ret;
   3083            goto out;
   3084        }
   3085        fuse_reply_buf(req, value, ret);
   3086    } else {
   3087        /*
   3088         * xattrmap only ever shortens the result,
   3089         * so we don't need to do anything clever with the
   3090         * allocation length here.
   3091         */
   3092        fuse_reply_xattr(req, ret);
   3093    }
   3094out_free:
   3095    if (fd >= 0) {
   3096        close(fd);
   3097    }
   3098
   3099    lo_inode_put(lo, &inode);
   3100    return;
   3101
   3102out_err:
   3103    saverr = errno;
   3104out:
   3105    fuse_reply_err(req, saverr);
   3106    goto out_free;
   3107}
   3108
   3109static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
   3110                        const char *value, size_t size, int flags,
   3111                        uint32_t extra_flags)
   3112{
   3113    char procname[64];
   3114    const char *name;
   3115    char *mapped_name;
   3116    struct lo_data *lo = lo_data(req);
   3117    struct lo_inode *inode;
   3118    ssize_t ret;
   3119    int saverr;
   3120    int fd = -1;
   3121    bool switched_creds = false;
   3122    bool cap_fsetid_dropped = false;
   3123    struct lo_cred old = {};
   3124
   3125    if (block_xattr(lo, in_name)) {
   3126        fuse_reply_err(req, EOPNOTSUPP);
   3127        return;
   3128    }
   3129
   3130    mapped_name = NULL;
   3131    name = in_name;
   3132    if (lo->xattrmap) {
   3133        ret = xattr_map_client(lo, in_name, &mapped_name);
   3134        if (ret < 0) {
   3135            fuse_reply_err(req, -ret);
   3136            return;
   3137        }
   3138        if (mapped_name) {
   3139            name = mapped_name;
   3140        }
   3141    }
   3142
   3143    inode = lo_inode(req, ino);
   3144    if (!inode) {
   3145        fuse_reply_err(req, EBADF);
   3146        g_free(mapped_name);
   3147        return;
   3148    }
   3149
   3150    saverr = ENOSYS;
   3151    if (!lo_data(req)->xattr) {
   3152        goto out;
   3153    }
   3154
   3155    fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
   3156             ", name=%s value=%s size=%zd)\n", ino, name, value, size);
   3157
   3158    sprintf(procname, "%i", inode->fd);
   3159    /*
   3160     * If we are setting posix access acl and if SGID needs to be
   3161     * cleared, then switch to caller's gid and drop CAP_FSETID
   3162     * and that should make sure host kernel clears SGID.
   3163     *
   3164     * This probably will not work when we support idmapped mounts.
   3165     * In that case we will need to find a non-root gid and switch
   3166     * to it. (Instead of gid in request). Fix it when we support
   3167     * idmapped mounts.
   3168     */
   3169    if (lo->posix_acl && !strcmp(name, "system.posix_acl_access")
   3170        && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) {
   3171        ret = lo_drop_cap_change_cred(req, &old, false, "FSETID",
   3172                                      &cap_fsetid_dropped);
   3173        if (ret) {
   3174            saverr = ret;
   3175            goto out;
   3176        }
   3177        switched_creds = true;
   3178    }
   3179    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   3180        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   3181        if (fd < 0) {
   3182            saverr = errno;
   3183            goto out;
   3184        }
   3185        ret = fsetxattr(fd, name, value, size, flags);
   3186        saverr = ret == -1 ? errno : 0;
   3187    } else {
   3188        /* fchdir should not fail here */
   3189        FCHDIR_NOFAIL(lo->proc_self_fd);
   3190        ret = setxattr(procname, name, value, size, flags);
   3191        saverr = ret == -1 ? errno : 0;
   3192        FCHDIR_NOFAIL(lo->root.fd);
   3193    }
   3194    if (switched_creds) {
   3195        if (cap_fsetid_dropped)
   3196            lo_restore_cred_gain_cap(&old, false, "FSETID");
   3197        else
   3198            lo_restore_cred(&old, false);
   3199    }
   3200
   3201out:
   3202    if (fd >= 0) {
   3203        close(fd);
   3204    }
   3205
   3206    lo_inode_put(lo, &inode);
   3207    g_free(mapped_name);
   3208    fuse_reply_err(req, saverr);
   3209}
   3210
   3211static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
   3212{
   3213    char procname[64];
   3214    const char *name;
   3215    char *mapped_name;
   3216    struct lo_data *lo = lo_data(req);
   3217    struct lo_inode *inode;
   3218    ssize_t ret;
   3219    int saverr;
   3220    int fd = -1;
   3221
   3222    if (block_xattr(lo, in_name)) {
   3223        fuse_reply_err(req, EOPNOTSUPP);
   3224        return;
   3225    }
   3226
   3227    mapped_name = NULL;
   3228    name = in_name;
   3229    if (lo->xattrmap) {
   3230        ret = xattr_map_client(lo, in_name, &mapped_name);
   3231        if (ret < 0) {
   3232            fuse_reply_err(req, -ret);
   3233            return;
   3234        }
   3235        if (mapped_name) {
   3236            name = mapped_name;
   3237        }
   3238    }
   3239
   3240    inode = lo_inode(req, ino);
   3241    if (!inode) {
   3242        fuse_reply_err(req, EBADF);
   3243        g_free(mapped_name);
   3244        return;
   3245    }
   3246
   3247    saverr = ENOSYS;
   3248    if (!lo_data(req)->xattr) {
   3249        goto out;
   3250    }
   3251
   3252    fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
   3253             name);
   3254
   3255    sprintf(procname, "%i", inode->fd);
   3256    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
   3257        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
   3258        if (fd < 0) {
   3259            saverr = errno;
   3260            goto out;
   3261        }
   3262        ret = fremovexattr(fd, name);
   3263        saverr = ret == -1 ? errno : 0;
   3264    } else {
   3265        /* fchdir should not fail here */
   3266        FCHDIR_NOFAIL(lo->proc_self_fd);
   3267        ret = removexattr(procname, name);
   3268        saverr = ret == -1 ? errno : 0;
   3269        FCHDIR_NOFAIL(lo->root.fd);
   3270    }
   3271
   3272out:
   3273    if (fd >= 0) {
   3274        close(fd);
   3275    }
   3276
   3277    lo_inode_put(lo, &inode);
   3278    g_free(mapped_name);
   3279    fuse_reply_err(req, saverr);
   3280}
   3281
   3282#ifdef HAVE_COPY_FILE_RANGE
   3283static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
   3284                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
   3285                               off_t off_out, struct fuse_file_info *fi_out,
   3286                               size_t len, int flags)
   3287{
   3288    int in_fd, out_fd;
   3289    ssize_t res;
   3290
   3291    in_fd = lo_fi_fd(req, fi_in);
   3292    out_fd = lo_fi_fd(req, fi_out);
   3293
   3294    fuse_log(FUSE_LOG_DEBUG,
   3295             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
   3296             "off=%ju, ino=%" PRIu64 "/fd=%d, "
   3297             "off=%ju, size=%zd, flags=0x%x)\n",
   3298             ino_in, in_fd, (intmax_t)off_in,
   3299             ino_out, out_fd, (intmax_t)off_out, len, flags);
   3300
   3301    res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
   3302    if (res < 0) {
   3303        fuse_reply_err(req, errno);
   3304    } else {
   3305        fuse_reply_write(req, res);
   3306    }
   3307}
   3308#endif
   3309
   3310static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
   3311                     struct fuse_file_info *fi)
   3312{
   3313    off_t res;
   3314
   3315    (void)ino;
   3316    res = lseek(lo_fi_fd(req, fi), off, whence);
   3317    if (res != -1) {
   3318        fuse_reply_lseek(req, res);
   3319    } else {
   3320        fuse_reply_err(req, errno);
   3321    }
   3322}
   3323
   3324static void lo_destroy(void *userdata)
   3325{
   3326    struct lo_data *lo = (struct lo_data *)userdata;
   3327
   3328    pthread_mutex_lock(&lo->mutex);
   3329    while (true) {
   3330        GHashTableIter iter;
   3331        gpointer key, value;
   3332
   3333        g_hash_table_iter_init(&iter, lo->inodes);
   3334        if (!g_hash_table_iter_next(&iter, &key, &value)) {
   3335            break;
   3336        }
   3337
   3338        struct lo_inode *inode = value;
   3339        unref_inode(lo, inode, inode->nlookup);
   3340    }
   3341    pthread_mutex_unlock(&lo->mutex);
   3342}
   3343
   3344static struct fuse_lowlevel_ops lo_oper = {
   3345    .init = lo_init,
   3346    .lookup = lo_lookup,
   3347    .mkdir = lo_mkdir,
   3348    .mknod = lo_mknod,
   3349    .symlink = lo_symlink,
   3350    .link = lo_link,
   3351    .unlink = lo_unlink,
   3352    .rmdir = lo_rmdir,
   3353    .rename = lo_rename,
   3354    .forget = lo_forget,
   3355    .forget_multi = lo_forget_multi,
   3356    .getattr = lo_getattr,
   3357    .setattr = lo_setattr,
   3358    .readlink = lo_readlink,
   3359    .opendir = lo_opendir,
   3360    .readdir = lo_readdir,
   3361    .readdirplus = lo_readdirplus,
   3362    .releasedir = lo_releasedir,
   3363    .fsyncdir = lo_fsyncdir,
   3364    .create = lo_create,
   3365    .getlk = lo_getlk,
   3366    .setlk = lo_setlk,
   3367    .open = lo_open,
   3368    .release = lo_release,
   3369    .flush = lo_flush,
   3370    .fsync = lo_fsync,
   3371    .read = lo_read,
   3372    .write_buf = lo_write_buf,
   3373    .statfs = lo_statfs,
   3374    .fallocate = lo_fallocate,
   3375    .flock = lo_flock,
   3376    .getxattr = lo_getxattr,
   3377    .listxattr = lo_listxattr,
   3378    .setxattr = lo_setxattr,
   3379    .removexattr = lo_removexattr,
   3380#ifdef HAVE_COPY_FILE_RANGE
   3381    .copy_file_range = lo_copy_file_range,
   3382#endif
   3383    .lseek = lo_lseek,
   3384    .destroy = lo_destroy,
   3385};
   3386
   3387/* Print vhost-user.json backend program capabilities */
   3388static void print_capabilities(void)
   3389{
   3390    printf("{\n");
   3391    printf("  \"type\": \"fs\"\n");
   3392    printf("}\n");
   3393}
   3394
   3395/*
   3396 * Drop all Linux capabilities because the wait parent process only needs to
   3397 * sit in waitpid(2) and terminate.
   3398 */
   3399static void setup_wait_parent_capabilities(void)
   3400{
   3401    capng_setpid(syscall(SYS_gettid));
   3402    capng_clear(CAPNG_SELECT_BOTH);
   3403    capng_apply(CAPNG_SELECT_BOTH);
   3404}
   3405
   3406/*
   3407 * Move to a new mount, net, and pid namespaces to isolate this process.
   3408 */
   3409static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
   3410{
   3411    pid_t child;
   3412
   3413    /*
   3414     * Create a new pid namespace for *child* processes.  We'll have to
   3415     * fork in order to enter the new pid namespace.  A new mount namespace
   3416     * is also needed so that we can remount /proc for the new pid
   3417     * namespace.
   3418     *
   3419     * Our UNIX domain sockets have been created.  Now we can move to
   3420     * an empty network namespace to prevent TCP/IP and other network
   3421     * activity in case this process is compromised.
   3422     */
   3423    if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
   3424        fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
   3425        exit(1);
   3426    }
   3427
   3428    child = fork();
   3429    if (child < 0) {
   3430        fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
   3431        exit(1);
   3432    }
   3433    if (child > 0) {
   3434        pid_t waited;
   3435        int wstatus;
   3436
   3437        setup_wait_parent_capabilities();
   3438
   3439        /* The parent waits for the child */
   3440        do {
   3441            waited = waitpid(child, &wstatus, 0);
   3442        } while (waited < 0 && errno == EINTR && !se->exited);
   3443
   3444        /* We were terminated by a signal, see fuse_signals.c */
   3445        if (se->exited) {
   3446            exit(0);
   3447        }
   3448
   3449        if (WIFEXITED(wstatus)) {
   3450            exit(WEXITSTATUS(wstatus));
   3451        }
   3452
   3453        exit(1);
   3454    }
   3455
   3456    /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
   3457    prctl(PR_SET_PDEATHSIG, SIGTERM);
   3458
   3459    /*
   3460     * If the mounts have shared propagation then we want to opt out so our
   3461     * mount changes don't affect the parent mount namespace.
   3462     */
   3463    if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
   3464        fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
   3465        exit(1);
   3466    }
   3467
   3468    /* The child must remount /proc to use the new pid namespace */
   3469    if (mount("proc", "/proc", "proc",
   3470              MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
   3471        fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
   3472        exit(1);
   3473    }
   3474
   3475    /*
   3476     * We only need /proc/self/fd. Prevent ".." from accessing parent
   3477     * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
   3478     * previously remounted with MS_REC | MS_SLAVE this mount change only
   3479     * affects our process.
   3480     */
   3481    if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
   3482        fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
   3483        exit(1);
   3484    }
   3485
   3486    /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
   3487    lo->proc_self_fd = open("/proc", O_PATH);
   3488    if (lo->proc_self_fd == -1) {
   3489        fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
   3490        exit(1);
   3491    }
   3492}
   3493
   3494/*
   3495 * Capture the capability state, we'll need to restore this for individual
   3496 * threads later; see load_capng.
   3497 */
   3498static void setup_capng(void)
   3499{
   3500    /* Note this accesses /proc so has to happen before the sandbox */
   3501    if (capng_get_caps_process()) {
   3502        fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
   3503        exit(1);
   3504    }
   3505    pthread_mutex_init(&cap.mutex, NULL);
   3506    pthread_mutex_lock(&cap.mutex);
   3507    cap.saved = capng_save_state();
   3508    if (!cap.saved) {
   3509        fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
   3510        exit(1);
   3511    }
   3512    pthread_mutex_unlock(&cap.mutex);
   3513}
   3514
   3515static void cleanup_capng(void)
   3516{
   3517    free(cap.saved);
   3518    cap.saved = NULL;
   3519    pthread_mutex_destroy(&cap.mutex);
   3520}
   3521
   3522
   3523/*
   3524 * Make the source directory our root so symlinks cannot escape and no other
   3525 * files are accessible.  Assumes unshare(CLONE_NEWNS) was already called.
   3526 */
   3527static void setup_mounts(const char *source)
   3528{
   3529    int oldroot;
   3530    int newroot;
   3531
   3532    if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
   3533        fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
   3534        exit(1);
   3535    }
   3536
   3537    /* This magic is based on lxc's lxc_pivot_root() */
   3538    oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   3539    if (oldroot < 0) {
   3540        fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
   3541        exit(1);
   3542    }
   3543
   3544    newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   3545    if (newroot < 0) {
   3546        fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
   3547        exit(1);
   3548    }
   3549
   3550    if (fchdir(newroot) < 0) {
   3551        fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
   3552        exit(1);
   3553    }
   3554
   3555    if (syscall(__NR_pivot_root, ".", ".") < 0) {
   3556        fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
   3557        exit(1);
   3558    }
   3559
   3560    if (fchdir(oldroot) < 0) {
   3561        fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
   3562        exit(1);
   3563    }
   3564
   3565    if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
   3566        fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
   3567        exit(1);
   3568    }
   3569
   3570    if (umount2(".", MNT_DETACH) < 0) {
   3571        fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
   3572        exit(1);
   3573    }
   3574
   3575    if (fchdir(newroot) < 0) {
   3576        fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
   3577        exit(1);
   3578    }
   3579
   3580    close(newroot);
   3581    close(oldroot);
   3582}
   3583
   3584/*
   3585 * Only keep capabilities in allowlist that are needed for file system operation
   3586 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
   3587 */
   3588static void setup_capabilities(char *modcaps_in)
   3589{
   3590    char *modcaps = modcaps_in;
   3591    pthread_mutex_lock(&cap.mutex);
   3592    capng_restore_state(&cap.saved);
   3593
   3594    /*
   3595     * Add to allowlist file system-related capabilities that are needed for a
   3596     * file server to act like root.  Drop everything else like networking and
   3597     * sysadmin capabilities.
   3598     *
   3599     * Exclusions:
   3600     * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
   3601     *    and we don't support that.
   3602     * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
   3603     *    used by the Smack LSM.  Omit it until there is demand for it.
   3604     */
   3605    capng_setpid(syscall(SYS_gettid));
   3606    capng_clear(CAPNG_SELECT_BOTH);
   3607    if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
   3608            CAP_CHOWN,
   3609            CAP_DAC_OVERRIDE,
   3610            CAP_FOWNER,
   3611            CAP_FSETID,
   3612            CAP_SETGID,
   3613            CAP_SETUID,
   3614            CAP_MKNOD,
   3615            CAP_SETFCAP,
   3616            -1)) {
   3617        fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
   3618        exit(1);
   3619    }
   3620
   3621    /*
   3622     * The modcaps option is a colon separated list of caps,
   3623     * each preceded by either + or -.
   3624     */
   3625    while (modcaps) {
   3626        capng_act_t action;
   3627        int cap;
   3628
   3629        char *next = strchr(modcaps, ':');
   3630        if (next) {
   3631            *next = '\0';
   3632            next++;
   3633        }
   3634
   3635        switch (modcaps[0]) {
   3636        case '+':
   3637            action = CAPNG_ADD;
   3638            break;
   3639
   3640        case '-':
   3641            action = CAPNG_DROP;
   3642            break;
   3643
   3644        default:
   3645            fuse_log(FUSE_LOG_ERR,
   3646                     "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
   3647                     __func__, modcaps[0]);
   3648            exit(1);
   3649        }
   3650        cap = capng_name_to_capability(modcaps + 1);
   3651        if (cap < 0) {
   3652            fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
   3653                     modcaps);
   3654            exit(1);
   3655        }
   3656        if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
   3657            fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
   3658                     __func__, modcaps);
   3659            exit(1);
   3660        }
   3661
   3662        modcaps = next;
   3663    }
   3664    g_free(modcaps_in);
   3665
   3666    if (capng_apply(CAPNG_SELECT_BOTH)) {
   3667        fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
   3668        exit(1);
   3669    }
   3670
   3671    cap.saved = capng_save_state();
   3672    if (!cap.saved) {
   3673        fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
   3674        exit(1);
   3675    }
   3676    pthread_mutex_unlock(&cap.mutex);
   3677}
   3678
   3679/*
   3680 * Use chroot as a weaker sandbox for environments where the process is
   3681 * launched without CAP_SYS_ADMIN.
   3682 */
   3683static void setup_chroot(struct lo_data *lo)
   3684{
   3685    lo->proc_self_fd = open("/proc/self/fd", O_PATH);
   3686    if (lo->proc_self_fd == -1) {
   3687        fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
   3688        exit(1);
   3689    }
   3690
   3691    /*
   3692     * Make the shared directory the file system root so that FUSE_OPEN
   3693     * (lo_open()) cannot escape the shared directory by opening a symlink.
   3694     *
   3695     * The chroot(2) syscall is later disabled by seccomp and the
   3696     * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
   3697     * is not possible.
   3698     *
   3699     * However, it's still possible to escape the chroot via lo->proc_self_fd
   3700     * but that requires first gaining control of the process.
   3701     */
   3702    if (chroot(lo->source) != 0) {
   3703        fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
   3704        exit(1);
   3705    }
   3706
   3707    /* Move into the chroot */
   3708    if (chdir("/") != 0) {
   3709        fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
   3710        exit(1);
   3711    }
   3712}
   3713
   3714/*
   3715 * Lock down this process to prevent access to other processes or files outside
   3716 * source directory.  This reduces the impact of arbitrary code execution bugs.
   3717 */
   3718static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
   3719                          bool enable_syslog)
   3720{
   3721    if (lo->sandbox == SANDBOX_NAMESPACE) {
   3722        setup_namespaces(lo, se);
   3723        setup_mounts(lo->source);
   3724    } else {
   3725        setup_chroot(lo);
   3726    }
   3727
   3728    setup_seccomp(enable_syslog);
   3729    setup_capabilities(g_strdup(lo->modcaps));
   3730}
   3731
   3732/* Set the maximum number of open file descriptors */
   3733static void setup_nofile_rlimit(unsigned long rlimit_nofile)
   3734{
   3735    struct rlimit rlim = {
   3736        .rlim_cur = rlimit_nofile,
   3737        .rlim_max = rlimit_nofile,
   3738    };
   3739
   3740    if (rlimit_nofile == 0) {
   3741        return; /* nothing to do */
   3742    }
   3743
   3744    if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
   3745        /* Ignore SELinux denials */
   3746        if (errno == EPERM) {
   3747            return;
   3748        }
   3749
   3750        fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
   3751        exit(1);
   3752    }
   3753}
   3754
   3755static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
   3756{
   3757    g_autofree char *localfmt = NULL;
   3758
   3759    if (current_log_level < level) {
   3760        return;
   3761    }
   3762
   3763    if (current_log_level == FUSE_LOG_DEBUG) {
   3764        if (use_syslog) {
   3765            /* no timestamp needed */
   3766            localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
   3767                                       fmt);
   3768        } else {
   3769            g_autoptr(GDateTime) now = g_date_time_new_now_utc();
   3770            g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z");
   3771            localfmt = g_strdup_printf("[%s] [ID: %08ld] %s",
   3772                                       nowstr, syscall(__NR_gettid), fmt);
   3773        }
   3774        fmt = localfmt;
   3775    }
   3776
   3777    if (use_syslog) {
   3778        int priority = LOG_ERR;
   3779        switch (level) {
   3780        case FUSE_LOG_EMERG:
   3781            priority = LOG_EMERG;
   3782            break;
   3783        case FUSE_LOG_ALERT:
   3784            priority = LOG_ALERT;
   3785            break;
   3786        case FUSE_LOG_CRIT:
   3787            priority = LOG_CRIT;
   3788            break;
   3789        case FUSE_LOG_ERR:
   3790            priority = LOG_ERR;
   3791            break;
   3792        case FUSE_LOG_WARNING:
   3793            priority = LOG_WARNING;
   3794            break;
   3795        case FUSE_LOG_NOTICE:
   3796            priority = LOG_NOTICE;
   3797            break;
   3798        case FUSE_LOG_INFO:
   3799            priority = LOG_INFO;
   3800            break;
   3801        case FUSE_LOG_DEBUG:
   3802            priority = LOG_DEBUG;
   3803            break;
   3804        }
   3805        vsyslog(priority, fmt, ap);
   3806    } else {
   3807        vfprintf(stderr, fmt, ap);
   3808    }
   3809}
   3810
   3811static void setup_root(struct lo_data *lo, struct lo_inode *root)
   3812{
   3813    int fd, res;
   3814    struct stat stat;
   3815    uint64_t mnt_id;
   3816
   3817    fd = open("/", O_PATH);
   3818    if (fd == -1) {
   3819        fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
   3820        exit(1);
   3821    }
   3822
   3823    res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
   3824                   &mnt_id);
   3825    if (res == -1) {
   3826        fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
   3827        exit(1);
   3828    }
   3829
   3830    root->filetype = S_IFDIR;
   3831    root->fd = fd;
   3832    root->key.ino = stat.st_ino;
   3833    root->key.dev = stat.st_dev;
   3834    root->key.mnt_id = mnt_id;
   3835    root->nlookup = 2;
   3836    g_atomic_int_set(&root->refcount, 2);
   3837    if (lo->posix_lock) {
   3838        pthread_mutex_init(&root->plock_mutex, NULL);
   3839        root->posix_locks = g_hash_table_new_full(
   3840            g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
   3841    }
   3842}
   3843
   3844static guint lo_key_hash(gconstpointer key)
   3845{
   3846    const struct lo_key *lkey = key;
   3847
   3848    return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id;
   3849}
   3850
   3851static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
   3852{
   3853    const struct lo_key *la = a;
   3854    const struct lo_key *lb = b;
   3855
   3856    return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id;
   3857}
   3858
   3859static void fuse_lo_data_cleanup(struct lo_data *lo)
   3860{
   3861    if (lo->inodes) {
   3862        g_hash_table_destroy(lo->inodes);
   3863    }
   3864
   3865    if (lo->root.posix_locks) {
   3866        g_hash_table_destroy(lo->root.posix_locks);
   3867    }
   3868    lo_map_destroy(&lo->fd_map);
   3869    lo_map_destroy(&lo->dirp_map);
   3870    lo_map_destroy(&lo->ino_map);
   3871
   3872    if (lo->proc_self_fd >= 0) {
   3873        close(lo->proc_self_fd);
   3874    }
   3875
   3876    if (lo->root.fd >= 0) {
   3877        close(lo->root.fd);
   3878    }
   3879
   3880    free(lo->xattrmap);
   3881    free_xattrmap(lo);
   3882    free(lo->xattr_security_capability);
   3883    free(lo->source);
   3884}
   3885
   3886static void qemu_version(void)
   3887{
   3888    printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n");
   3889}
   3890
   3891int main(int argc, char *argv[])
   3892{
   3893    struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
   3894    struct fuse_session *se;
   3895    struct fuse_cmdline_opts opts;
   3896    struct lo_data lo = {
   3897        .sandbox = SANDBOX_NAMESPACE,
   3898        .debug = 0,
   3899        .writeback = 0,
   3900        .posix_lock = 0,
   3901        .allow_direct_io = 0,
   3902        .proc_self_fd = -1,
   3903        .user_killpriv_v2 = -1,
   3904        .user_posix_acl = -1,
   3905    };
   3906    struct lo_map_elem *root_elem;
   3907    struct lo_map_elem *reserve_elem;
   3908    int ret = -1;
   3909
   3910    /* Initialize time conversion information for localtime_r(). */
   3911    tzset();
   3912
   3913    /* Don't mask creation mode, kernel already did that */
   3914    umask(0);
   3915
   3916    qemu_init_exec_dir(argv[0]);
   3917
   3918    pthread_mutex_init(&lo.mutex, NULL);
   3919    lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
   3920    lo.root.fd = -1;
   3921    lo.root.fuse_ino = FUSE_ROOT_ID;
   3922    lo.cache = CACHE_AUTO;
   3923
   3924    /*
   3925     * Set up the ino map like this:
   3926     * [0] Reserved (will not be used)
   3927     * [1] Root inode
   3928     */
   3929    lo_map_init(&lo.ino_map);
   3930    reserve_elem = lo_map_reserve(&lo.ino_map, 0);
   3931    if (!reserve_elem) {
   3932        fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
   3933        goto err_out1;
   3934    }
   3935    reserve_elem->in_use = false;
   3936    root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
   3937    if (!root_elem) {
   3938        fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
   3939        goto err_out1;
   3940    }
   3941    root_elem->inode = &lo.root;
   3942
   3943    lo_map_init(&lo.dirp_map);
   3944    lo_map_init(&lo.fd_map);
   3945
   3946    if (fuse_parse_cmdline(&args, &opts) != 0) {
   3947        goto err_out1;
   3948    }
   3949    fuse_set_log_func(log_func);
   3950    use_syslog = opts.syslog;
   3951    if (use_syslog) {
   3952        openlog("virtiofsd", LOG_PID, LOG_DAEMON);
   3953    }
   3954
   3955    if (opts.show_help) {
   3956        printf("usage: %s [options]\n\n", argv[0]);
   3957        fuse_cmdline_help();
   3958        printf("    -o source=PATH             shared directory tree\n");
   3959        fuse_lowlevel_help();
   3960        ret = 0;
   3961        goto err_out1;
   3962    } else if (opts.show_version) {
   3963        qemu_version();
   3964        fuse_lowlevel_version();
   3965        ret = 0;
   3966        goto err_out1;
   3967    } else if (opts.print_capabilities) {
   3968        print_capabilities();
   3969        ret = 0;
   3970        goto err_out1;
   3971    }
   3972
   3973    if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
   3974        goto err_out1;
   3975    }
   3976
   3977    if (opts.log_level != 0) {
   3978        current_log_level = opts.log_level;
   3979    } else {
   3980        /* default log level is INFO */
   3981        current_log_level = FUSE_LOG_INFO;
   3982    }
   3983    lo.debug = opts.debug;
   3984    if (lo.debug) {
   3985        current_log_level = FUSE_LOG_DEBUG;
   3986    }
   3987    if (lo.source) {
   3988        struct stat stat;
   3989        int res;
   3990
   3991        res = lstat(lo.source, &stat);
   3992        if (res == -1) {
   3993            fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
   3994                     lo.source);
   3995            exit(1);
   3996        }
   3997        if (!S_ISDIR(stat.st_mode)) {
   3998            fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
   3999            exit(1);
   4000        }
   4001    } else {
   4002        lo.source = strdup("/");
   4003        if (!lo.source) {
   4004            fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
   4005            goto err_out1;
   4006        }
   4007    }
   4008
   4009    if (lo.xattrmap) {
   4010        lo.xattr = 1;
   4011        parse_xattrmap(&lo);
   4012    }
   4013
   4014    if (!lo.timeout_set) {
   4015        switch (lo.cache) {
   4016        case CACHE_NONE:
   4017            lo.timeout = 0.0;
   4018            break;
   4019
   4020        case CACHE_AUTO:
   4021            lo.timeout = 1.0;
   4022            break;
   4023
   4024        case CACHE_ALWAYS:
   4025            lo.timeout = 86400.0;
   4026            break;
   4027        }
   4028    } else if (lo.timeout < 0) {
   4029        fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
   4030        exit(1);
   4031    }
   4032
   4033    if (lo.user_posix_acl == 1 && !lo.xattr) {
   4034        fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled."
   4035                 "\n");
   4036        exit(1);
   4037    }
   4038
   4039    lo.use_statx = true;
   4040
   4041    se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
   4042    if (se == NULL) {
   4043        goto err_out1;
   4044    }
   4045
   4046    if (fuse_set_signal_handlers(se) != 0) {
   4047        goto err_out2;
   4048    }
   4049
   4050    if (fuse_session_mount(se) != 0) {
   4051        goto err_out3;
   4052    }
   4053
   4054    fuse_daemonize(opts.foreground);
   4055
   4056    setup_nofile_rlimit(opts.rlimit_nofile);
   4057
   4058    /* Must be before sandbox since it wants /proc */
   4059    setup_capng();
   4060
   4061    setup_sandbox(&lo, se, opts.syslog);
   4062
   4063    setup_root(&lo, &lo.root);
   4064    /* Block until ctrl+c or fusermount -u */
   4065    ret = virtio_loop(se);
   4066
   4067    fuse_session_unmount(se);
   4068    cleanup_capng();
   4069err_out3:
   4070    fuse_remove_signal_handlers(se);
   4071err_out2:
   4072    fuse_session_destroy(se);
   4073err_out1:
   4074    fuse_opt_free_args(&args);
   4075
   4076    fuse_lo_data_cleanup(&lo);
   4077
   4078    return ret ? 1 : 0;
   4079}