cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

fs.c (38796B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Landlock LSM - Filesystem management and hooks
      4 *
      5 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
      6 * Copyright © 2018-2020 ANSSI
      7 * Copyright © 2021-2022 Microsoft Corporation
      8 */
      9
     10#include <linux/atomic.h>
     11#include <linux/bitops.h>
     12#include <linux/bits.h>
     13#include <linux/compiler_types.h>
     14#include <linux/dcache.h>
     15#include <linux/err.h>
     16#include <linux/fs.h>
     17#include <linux/init.h>
     18#include <linux/kernel.h>
     19#include <linux/limits.h>
     20#include <linux/list.h>
     21#include <linux/lsm_hooks.h>
     22#include <linux/mount.h>
     23#include <linux/namei.h>
     24#include <linux/path.h>
     25#include <linux/rcupdate.h>
     26#include <linux/spinlock.h>
     27#include <linux/stat.h>
     28#include <linux/types.h>
     29#include <linux/wait_bit.h>
     30#include <linux/workqueue.h>
     31#include <uapi/linux/landlock.h>
     32
     33#include "common.h"
     34#include "cred.h"
     35#include "fs.h"
     36#include "limits.h"
     37#include "object.h"
     38#include "ruleset.h"
     39#include "setup.h"
     40
     41/* Underlying object management */
     42
     43static void release_inode(struct landlock_object *const object)
     44	__releases(object->lock)
     45{
     46	struct inode *const inode = object->underobj;
     47	struct super_block *sb;
     48
     49	if (!inode) {
     50		spin_unlock(&object->lock);
     51		return;
     52	}
     53
     54	/*
     55	 * Protects against concurrent use by hook_sb_delete() of the reference
     56	 * to the underlying inode.
     57	 */
     58	object->underobj = NULL;
     59	/*
     60	 * Makes sure that if the filesystem is concurrently unmounted,
     61	 * hook_sb_delete() will wait for us to finish iput().
     62	 */
     63	sb = inode->i_sb;
     64	atomic_long_inc(&landlock_superblock(sb)->inode_refs);
     65	spin_unlock(&object->lock);
     66	/*
     67	 * Because object->underobj was not NULL, hook_sb_delete() and
     68	 * get_inode_object() guarantee that it is safe to reset
     69	 * landlock_inode(inode)->object while it is not NULL.  It is therefore
     70	 * not necessary to lock inode->i_lock.
     71	 */
     72	rcu_assign_pointer(landlock_inode(inode)->object, NULL);
     73	/*
     74	 * Now, new rules can safely be tied to @inode with get_inode_object().
     75	 */
     76
     77	iput(inode);
     78	if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
     79		wake_up_var(&landlock_superblock(sb)->inode_refs);
     80}
     81
     82static const struct landlock_object_underops landlock_fs_underops = {
     83	.release = release_inode
     84};
     85
     86/* Ruleset management */
     87
     88static struct landlock_object *get_inode_object(struct inode *const inode)
     89{
     90	struct landlock_object *object, *new_object;
     91	struct landlock_inode_security *inode_sec = landlock_inode(inode);
     92
     93	rcu_read_lock();
     94retry:
     95	object = rcu_dereference(inode_sec->object);
     96	if (object) {
     97		if (likely(refcount_inc_not_zero(&object->usage))) {
     98			rcu_read_unlock();
     99			return object;
    100		}
    101		/*
    102		 * We are racing with release_inode(), the object is going
    103		 * away.  Wait for release_inode(), then retry.
    104		 */
    105		spin_lock(&object->lock);
    106		spin_unlock(&object->lock);
    107		goto retry;
    108	}
    109	rcu_read_unlock();
    110
    111	/*
    112	 * If there is no object tied to @inode, then create a new one (without
    113	 * holding any locks).
    114	 */
    115	new_object = landlock_create_object(&landlock_fs_underops, inode);
    116	if (IS_ERR(new_object))
    117		return new_object;
    118
    119	/*
    120	 * Protects against concurrent calls to get_inode_object() or
    121	 * hook_sb_delete().
    122	 */
    123	spin_lock(&inode->i_lock);
    124	if (unlikely(rcu_access_pointer(inode_sec->object))) {
    125		/* Someone else just created the object, bail out and retry. */
    126		spin_unlock(&inode->i_lock);
    127		kfree(new_object);
    128
    129		rcu_read_lock();
    130		goto retry;
    131	}
    132
    133	/*
    134	 * @inode will be released by hook_sb_delete() on its superblock
    135	 * shutdown, or by release_inode() when no more ruleset references the
    136	 * related object.
    137	 */
    138	ihold(inode);
    139	rcu_assign_pointer(inode_sec->object, new_object);
    140	spin_unlock(&inode->i_lock);
    141	return new_object;
    142}
    143
    144/* All access rights that can be tied to files. */
    145/* clang-format off */
    146#define ACCESS_FILE ( \
    147	LANDLOCK_ACCESS_FS_EXECUTE | \
    148	LANDLOCK_ACCESS_FS_WRITE_FILE | \
    149	LANDLOCK_ACCESS_FS_READ_FILE)
    150/* clang-format on */
    151
    152/*
    153 * @path: Should have been checked by get_path_from_fd().
    154 */
    155int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
    156			    const struct path *const path,
    157			    access_mask_t access_rights)
    158{
    159	int err;
    160	struct landlock_object *object;
    161
    162	/* Files only get access rights that make sense. */
    163	if (!d_is_dir(path->dentry) &&
    164	    (access_rights | ACCESS_FILE) != ACCESS_FILE)
    165		return -EINVAL;
    166	if (WARN_ON_ONCE(ruleset->num_layers != 1))
    167		return -EINVAL;
    168
    169	/* Transforms relative access rights to absolute ones. */
    170	access_rights |= LANDLOCK_MASK_ACCESS_FS & ~ruleset->fs_access_masks[0];
    171	object = get_inode_object(d_backing_inode(path->dentry));
    172	if (IS_ERR(object))
    173		return PTR_ERR(object);
    174	mutex_lock(&ruleset->lock);
    175	err = landlock_insert_rule(ruleset, object, access_rights);
    176	mutex_unlock(&ruleset->lock);
    177	/*
    178	 * No need to check for an error because landlock_insert_rule()
    179	 * increments the refcount for the new object if needed.
    180	 */
    181	landlock_put_object(object);
    182	return err;
    183}
    184
    185/* Access-control management */
    186
    187/*
    188 * The lifetime of the returned rule is tied to @domain.
    189 *
    190 * Returns NULL if no rule is found or if @dentry is negative.
    191 */
    192static inline const struct landlock_rule *
    193find_rule(const struct landlock_ruleset *const domain,
    194	  const struct dentry *const dentry)
    195{
    196	const struct landlock_rule *rule;
    197	const struct inode *inode;
    198
     199	/* Ignores nonexistent leaves. */
    200	if (d_is_negative(dentry))
    201		return NULL;
    202
    203	inode = d_backing_inode(dentry);
    204	rcu_read_lock();
    205	rule = landlock_find_rule(
    206		domain, rcu_dereference(landlock_inode(inode)->object));
    207	rcu_read_unlock();
    208	return rule;
    209}
    210
    211/*
    212 * @layer_masks is read and may be updated according to the access request and
    213 * the matching rule.
    214 *
    215 * Returns true if the request is allowed (i.e. relevant layer masks for the
    216 * request are empty).
    217 */
    218static inline bool
    219unmask_layers(const struct landlock_rule *const rule,
    220	      const access_mask_t access_request,
    221	      layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
    222{
    223	size_t layer_level;
    224
    225	if (!access_request || !layer_masks)
    226		return true;
    227	if (!rule)
    228		return false;
    229
    230	/*
    231	 * An access is granted if, for each policy layer, at least one rule
    232	 * encountered on the pathwalk grants the requested access,
    233	 * regardless of its position in the layer stack.  We must then check
    234	 * the remaining layers for each inode, from the first added layer to
     235	 * the last one.  When there are multiple requested accesses, for each
    236	 * policy layer, the full set of requested accesses may not be granted
    237	 * by only one rule, but by the union (binary OR) of multiple rules.
    238	 * E.g. /a/b <execute> + /a <read> => /a/b <execute + read>
    239	 */
    240	for (layer_level = 0; layer_level < rule->num_layers; layer_level++) {
    241		const struct landlock_layer *const layer =
    242			&rule->layers[layer_level];
    243		const layer_mask_t layer_bit = BIT_ULL(layer->level - 1);
    244		const unsigned long access_req = access_request;
    245		unsigned long access_bit;
    246		bool is_empty;
    247
    248		/*
    249		 * Records in @layer_masks which layer grants access to each
    250		 * requested access.
    251		 */
    252		is_empty = true;
    253		for_each_set_bit(access_bit, &access_req,
    254				 ARRAY_SIZE(*layer_masks)) {
    255			if (layer->access & BIT_ULL(access_bit))
    256				(*layer_masks)[access_bit] &= ~layer_bit;
    257			is_empty = is_empty && !(*layer_masks)[access_bit];
    258		}
    259		if (is_empty)
    260			return true;
    261	}
    262	return false;
    263}
    264
    265/*
    266 * Allows access to pseudo filesystems that will never be mountable (e.g.
    267 * sockfs, pipefs), but can still be reachable through
    268 * /proc/<pid>/fd/<file-descriptor>
    269 */
    270static inline bool is_nouser_or_private(const struct dentry *dentry)
    271{
    272	return (dentry->d_sb->s_flags & SB_NOUSER) ||
    273	       (d_is_positive(dentry) &&
    274		unlikely(IS_PRIVATE(d_backing_inode(dentry))));
    275}
    276
    277static inline access_mask_t
    278get_handled_accesses(const struct landlock_ruleset *const domain)
    279{
    280	access_mask_t access_dom = 0;
    281	unsigned long access_bit;
    282
    283	for (access_bit = 0; access_bit < LANDLOCK_NUM_ACCESS_FS;
    284	     access_bit++) {
    285		size_t layer_level;
    286
    287		for (layer_level = 0; layer_level < domain->num_layers;
    288		     layer_level++) {
    289			if (domain->fs_access_masks[layer_level] &
    290			    BIT_ULL(access_bit)) {
    291				access_dom |= BIT_ULL(access_bit);
    292				break;
    293			}
    294		}
    295	}
    296	return access_dom;
    297}
    298
    299static inline access_mask_t
    300init_layer_masks(const struct landlock_ruleset *const domain,
    301		 const access_mask_t access_request,
    302		 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
    303{
    304	access_mask_t handled_accesses = 0;
    305	size_t layer_level;
    306
    307	memset(layer_masks, 0, sizeof(*layer_masks));
    308	/* An empty access request can happen because of O_WRONLY | O_RDWR. */
    309	if (!access_request)
    310		return 0;
    311
    312	/* Saves all handled accesses per layer. */
    313	for (layer_level = 0; layer_level < domain->num_layers; layer_level++) {
    314		const unsigned long access_req = access_request;
    315		unsigned long access_bit;
    316
    317		for_each_set_bit(access_bit, &access_req,
    318				 ARRAY_SIZE(*layer_masks)) {
    319			if (domain->fs_access_masks[layer_level] &
    320			    BIT_ULL(access_bit)) {
    321				(*layer_masks)[access_bit] |=
    322					BIT_ULL(layer_level);
    323				handled_accesses |= BIT_ULL(access_bit);
    324			}
    325		}
    326	}
    327	return handled_accesses;
    328}
    329
    330/*
    331 * Check that a destination file hierarchy has more restrictions than a source
    332 * file hierarchy.  This is only used for link and rename actions.
    333 *
    334 * @layer_masks_child2: Optional child masks.
    335 */
    336static inline bool no_more_access(
    337	const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
    338	const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS],
    339	const bool child1_is_directory,
    340	const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
    341	const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS],
    342	const bool child2_is_directory)
    343{
    344	unsigned long access_bit;
    345
    346	for (access_bit = 0; access_bit < ARRAY_SIZE(*layer_masks_parent2);
    347	     access_bit++) {
    348		/* Ignores accesses that only make sense for directories. */
    349		const bool is_file_access =
    350			!!(BIT_ULL(access_bit) & ACCESS_FILE);
    351
    352		if (child1_is_directory || is_file_access) {
    353			/*
    354			 * Checks if the destination restrictions are a
    355			 * superset of the source ones (i.e. inherited access
    356			 * rights without child exceptions):
    357			 * restrictions(parent2) >= restrictions(child1)
    358			 */
    359			if ((((*layer_masks_parent1)[access_bit] &
    360			      (*layer_masks_child1)[access_bit]) |
    361			     (*layer_masks_parent2)[access_bit]) !=
    362			    (*layer_masks_parent2)[access_bit])
    363				return false;
    364		}
    365
    366		if (!layer_masks_child2)
    367			continue;
    368		if (child2_is_directory || is_file_access) {
    369			/*
    370			 * Checks inverted restrictions for RENAME_EXCHANGE:
    371			 * restrictions(parent1) >= restrictions(child2)
    372			 */
    373			if ((((*layer_masks_parent2)[access_bit] &
    374			      (*layer_masks_child2)[access_bit]) |
    375			     (*layer_masks_parent1)[access_bit]) !=
    376			    (*layer_masks_parent1)[access_bit])
    377				return false;
    378		}
    379	}
    380	return true;
    381}
    382
    383/*
    384 * Removes @layer_masks accesses that are not requested.
    385 *
    386 * Returns true if the request is allowed, false otherwise.
    387 */
    388static inline bool
    389scope_to_request(const access_mask_t access_request,
    390		 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
    391{
    392	const unsigned long access_req = access_request;
    393	unsigned long access_bit;
    394
    395	if (WARN_ON_ONCE(!layer_masks))
    396		return true;
    397
    398	for_each_clear_bit(access_bit, &access_req, ARRAY_SIZE(*layer_masks))
    399		(*layer_masks)[access_bit] = 0;
    400	return !memchr_inv(layer_masks, 0, sizeof(*layer_masks));
    401}
    402
    403/*
    404 * Returns true if there is at least one access right different than
    405 * LANDLOCK_ACCESS_FS_REFER.
    406 */
    407static inline bool
    408is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS],
    409	  const access_mask_t access_request)
    410{
    411	unsigned long access_bit;
    412	/* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */
    413	const unsigned long access_check = access_request &
    414					   ~LANDLOCK_ACCESS_FS_REFER;
    415
    416	if (!layer_masks)
    417		return false;
    418
    419	for_each_set_bit(access_bit, &access_check, ARRAY_SIZE(*layer_masks)) {
    420		if ((*layer_masks)[access_bit])
    421			return true;
    422	}
    423	return false;
    424}
    425
    426/**
    427 * check_access_path_dual - Check accesses for requests with a common path
    428 *
    429 * @domain: Domain to check against.
    430 * @path: File hierarchy to walk through.
    431 * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is
    432 *     equal to @layer_masks_parent2 (if any).  This is tied to the unique
    433 *     requested path for most actions, or the source in case of a refer action
    434 *     (i.e. rename or link), or the source and destination in case of
    435 *     RENAME_EXCHANGE.
    436 * @layer_masks_parent1: Pointer to a matrix of layer masks per access
    437 *     masks, identifying the layers that forbid a specific access.  Bits from
    438 *     this matrix can be unset according to the @path walk.  An empty matrix
    439 *     means that @domain allows all possible Landlock accesses (i.e. not only
    440 *     those identified by @access_request_parent1).  This matrix can
    441 *     initially refer to domain layer masks and, when the accesses for the
    442 *     destination and source are the same, to requested layer masks.
    443 * @dentry_child1: Dentry to the initial child of the parent1 path.  This
    444 *     pointer must be NULL for non-refer actions (i.e. not link nor rename).
    445 * @access_request_parent2: Similar to @access_request_parent1 but for a
    446 *     request involving a source and a destination.  This refers to the
    447 *     destination, except in case of RENAME_EXCHANGE where it also refers to
    448 *     the source.  Must be set to 0 when using a simple path request.
    449 * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer
    450 *     action.  This must be NULL otherwise.
    451 * @dentry_child2: Dentry to the initial child of the parent2 path.  This
    452 *     pointer is only set for RENAME_EXCHANGE actions and must be NULL
    453 *     otherwise.
    454 *
    455 * This helper first checks that the destination has a superset of restrictions
    456 * compared to the source (if any) for a common path.  Because of
    457 * RENAME_EXCHANGE actions, source and destinations may be swapped.  It then
    458 * checks that the collected accesses and the remaining ones are enough to
    459 * allow the request.
    460 *
    461 * Returns:
    462 * - 0 if the access request is granted;
    463 * - -EACCES if it is denied because of access right other than
    464 *   LANDLOCK_ACCESS_FS_REFER;
     465 * - -EXDEV if the renaming or linking would be a privilege escalation
     466 *   (according to each layered policy), or if LANDLOCK_ACCESS_FS_REFER is
    467 *   not allowed by the source or the destination.
    468 */
    469static int check_access_path_dual(
    470	const struct landlock_ruleset *const domain,
    471	const struct path *const path,
    472	const access_mask_t access_request_parent1,
    473	layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
    474	const struct dentry *const dentry_child1,
    475	const access_mask_t access_request_parent2,
    476	layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
    477	const struct dentry *const dentry_child2)
    478{
    479	bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check,
    480	     child1_is_directory = true, child2_is_directory = true;
    481	struct path walker_path;
    482	access_mask_t access_masked_parent1, access_masked_parent2;
    483	layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS],
    484		_layer_masks_child2[LANDLOCK_NUM_ACCESS_FS];
    485	layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL,
    486	(*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL;
    487
    488	if (!access_request_parent1 && !access_request_parent2)
    489		return 0;
    490	if (WARN_ON_ONCE(!domain || !path))
    491		return 0;
    492	if (is_nouser_or_private(path->dentry))
    493		return 0;
    494	if (WARN_ON_ONCE(domain->num_layers < 1 || !layer_masks_parent1))
    495		return -EACCES;
    496
    497	if (unlikely(layer_masks_parent2)) {
    498		if (WARN_ON_ONCE(!dentry_child1))
    499			return -EACCES;
    500		/*
    501		 * For a double request, first check for potential privilege
    502		 * escalation by looking at domain handled accesses (which are
    503		 * a superset of the meaningful requested accesses).
    504		 */
    505		access_masked_parent1 = access_masked_parent2 =
    506			get_handled_accesses(domain);
    507		is_dom_check = true;
    508	} else {
    509		if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
    510			return -EACCES;
    511		/* For a simple request, only check for requested accesses. */
    512		access_masked_parent1 = access_request_parent1;
    513		access_masked_parent2 = access_request_parent2;
    514		is_dom_check = false;
    515	}
    516
    517	if (unlikely(dentry_child1)) {
    518		unmask_layers(find_rule(domain, dentry_child1),
    519			      init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
    520					       &_layer_masks_child1),
    521			      &_layer_masks_child1);
    522		layer_masks_child1 = &_layer_masks_child1;
    523		child1_is_directory = d_is_dir(dentry_child1);
    524	}
    525	if (unlikely(dentry_child2)) {
    526		unmask_layers(find_rule(domain, dentry_child2),
    527			      init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
    528					       &_layer_masks_child2),
    529			      &_layer_masks_child2);
    530		layer_masks_child2 = &_layer_masks_child2;
    531		child2_is_directory = d_is_dir(dentry_child2);
    532	}
    533
    534	walker_path = *path;
    535	path_get(&walker_path);
    536	/*
    537	 * We need to walk through all the hierarchy to not miss any relevant
    538	 * restriction.
    539	 */
    540	while (true) {
    541		struct dentry *parent_dentry;
    542		const struct landlock_rule *rule;
    543
    544		/*
     545		 * If at least all accesses allowed on the destination are
     546		 * already allowed on the source, respectively if there are at
     547		 * least as many restrictions on the destination as on the
     548		 * source, then we can safely refer files from the source to
     549		 * the destination without risking a privilege escalation.
     550		 * This also applies in the case of RENAME_EXCHANGE, which
     551		 * implies checks in both directions.  This is crucial for
     552		 * standalone multilayered security policies.  Furthermore,
     553		 * this helps policy writers avoid shooting themselves in the
     554		 * foot.
    555		 */
    556		if (unlikely(is_dom_check &&
    557			     no_more_access(
    558				     layer_masks_parent1, layer_masks_child1,
    559				     child1_is_directory, layer_masks_parent2,
    560				     layer_masks_child2,
    561				     child2_is_directory))) {
    562			allowed_parent1 = scope_to_request(
    563				access_request_parent1, layer_masks_parent1);
    564			allowed_parent2 = scope_to_request(
    565				access_request_parent2, layer_masks_parent2);
    566
    567			/* Stops when all accesses are granted. */
    568			if (allowed_parent1 && allowed_parent2)
    569				break;
    570
    571			/*
    572			 * Now, downgrades the remaining checks from domain
    573			 * handled accesses to requested accesses.
    574			 */
    575			is_dom_check = false;
    576			access_masked_parent1 = access_request_parent1;
    577			access_masked_parent2 = access_request_parent2;
    578		}
    579
    580		rule = find_rule(domain, walker_path.dentry);
    581		allowed_parent1 = unmask_layers(rule, access_masked_parent1,
    582						layer_masks_parent1);
    583		allowed_parent2 = unmask_layers(rule, access_masked_parent2,
    584						layer_masks_parent2);
    585
    586		/* Stops when a rule from each layer grants access. */
    587		if (allowed_parent1 && allowed_parent2)
    588			break;
    589
    590jump_up:
    591		if (walker_path.dentry == walker_path.mnt->mnt_root) {
    592			if (follow_up(&walker_path)) {
    593				/* Ignores hidden mount points. */
    594				goto jump_up;
    595			} else {
    596				/*
    597				 * Stops at the real root.  Denies access
    598				 * because not all layers have granted access.
    599				 */
    600				break;
    601			}
    602		}
    603		if (unlikely(IS_ROOT(walker_path.dentry))) {
    604			/*
    605			 * Stops at disconnected root directories.  Only allows
    606			 * access to internal filesystems (e.g. nsfs, which is
    607			 * reachable through /proc/<pid>/ns/<namespace>).
    608			 */
    609			allowed_parent1 = allowed_parent2 =
    610				!!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
    611			break;
    612		}
    613		parent_dentry = dget_parent(walker_path.dentry);
    614		dput(walker_path.dentry);
    615		walker_path.dentry = parent_dentry;
    616	}
    617	path_put(&walker_path);
    618
    619	if (allowed_parent1 && allowed_parent2)
    620		return 0;
    621
    622	/*
    623	 * This prioritizes EACCES over EXDEV for all actions, including
    624	 * renames with RENAME_EXCHANGE.
    625	 */
    626	if (likely(is_eacces(layer_masks_parent1, access_request_parent1) ||
    627		   is_eacces(layer_masks_parent2, access_request_parent2)))
    628		return -EACCES;
    629
    630	/*
    631	 * Gracefully forbids reparenting if the destination directory
    632	 * hierarchy is not a superset of restrictions of the source directory
    633	 * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the
    634	 * source or the destination.
    635	 */
    636	return -EXDEV;
    637}
    638
    639static inline int check_access_path(const struct landlock_ruleset *const domain,
    640				    const struct path *const path,
    641				    access_mask_t access_request)
    642{
    643	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
    644
    645	access_request = init_layer_masks(domain, access_request, &layer_masks);
    646	return check_access_path_dual(domain, path, access_request,
    647				      &layer_masks, NULL, 0, NULL, NULL);
    648}
    649
    650static inline int current_check_access_path(const struct path *const path,
    651					    const access_mask_t access_request)
    652{
    653	const struct landlock_ruleset *const dom =
    654		landlock_get_current_domain();
    655
    656	if (!dom)
    657		return 0;
    658	return check_access_path(dom, path, access_request);
    659}
    660
    661static inline access_mask_t get_mode_access(const umode_t mode)
    662{
    663	switch (mode & S_IFMT) {
    664	case S_IFLNK:
    665		return LANDLOCK_ACCESS_FS_MAKE_SYM;
    666	case 0:
    667		/* A zero mode translates to S_IFREG. */
    668	case S_IFREG:
    669		return LANDLOCK_ACCESS_FS_MAKE_REG;
    670	case S_IFDIR:
    671		return LANDLOCK_ACCESS_FS_MAKE_DIR;
    672	case S_IFCHR:
    673		return LANDLOCK_ACCESS_FS_MAKE_CHAR;
    674	case S_IFBLK:
    675		return LANDLOCK_ACCESS_FS_MAKE_BLOCK;
    676	case S_IFIFO:
    677		return LANDLOCK_ACCESS_FS_MAKE_FIFO;
    678	case S_IFSOCK:
    679		return LANDLOCK_ACCESS_FS_MAKE_SOCK;
    680	default:
    681		WARN_ON_ONCE(1);
    682		return 0;
    683	}
    684}
    685
    686static inline access_mask_t maybe_remove(const struct dentry *const dentry)
    687{
    688	if (d_is_negative(dentry))
    689		return 0;
    690	return d_is_dir(dentry) ? LANDLOCK_ACCESS_FS_REMOVE_DIR :
    691				  LANDLOCK_ACCESS_FS_REMOVE_FILE;
    692}
    693
    694/**
    695 * collect_domain_accesses - Walk through a file path and collect accesses
    696 *
    697 * @domain: Domain to check against.
    698 * @mnt_root: Last directory to check.
    699 * @dir: Directory to start the walk from.
    700 * @layer_masks_dom: Where to store the collected accesses.
    701 *
    702 * This helper is useful to begin a path walk from the @dir directory to a
    703 * @mnt_root directory used as a mount point.  This mount point is the common
    704 * ancestor between the source and the destination of a renamed and linked
     705 * ancestor between the source and the destination of a renamed or linked
    706 * allowed accesses in @layer_masks_dom.
    707 *
    708 * This is similar to check_access_path_dual() but much simpler because it only
     709 * handles walking on the same mount point and only checks one set of accesses.
    710 *
    711 * Returns:
    712 * - true if all the domain access rights are allowed for @dir;
    713 * - false if the walk reached @mnt_root.
    714 */
    715static bool collect_domain_accesses(
    716	const struct landlock_ruleset *const domain,
    717	const struct dentry *const mnt_root, struct dentry *dir,
    718	layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS])
    719{
    720	unsigned long access_dom;
    721	bool ret = false;
    722
    723	if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom))
    724		return true;
    725	if (is_nouser_or_private(dir))
    726		return true;
    727
    728	access_dom = init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
    729				      layer_masks_dom);
    730
    731	dget(dir);
    732	while (true) {
    733		struct dentry *parent_dentry;
    734
    735		/* Gets all layers allowing all domain accesses. */
    736		if (unmask_layers(find_rule(domain, dir), access_dom,
    737				  layer_masks_dom)) {
    738			/*
    739			 * Stops when all handled accesses are allowed by at
    740			 * least one rule in each layer.
    741			 */
    742			ret = true;
    743			break;
    744		}
    745
    746		/* We should not reach a root other than @mnt_root. */
    747		if (dir == mnt_root || WARN_ON_ONCE(IS_ROOT(dir)))
    748			break;
    749
    750		parent_dentry = dget_parent(dir);
    751		dput(dir);
    752		dir = parent_dentry;
    753	}
    754	dput(dir);
    755	return ret;
    756}
    757
    758/**
    759 * current_check_refer_path - Check if a rename or link action is allowed
    760 *
    761 * @old_dentry: File or directory requested to be moved or linked.
    762 * @new_dir: Destination parent directory.
    763 * @new_dentry: Destination file or directory.
     764 * @removable: Set to true if it is a rename operation.
     765 * @exchange: Set to true if it is a rename operation with RENAME_EXCHANGE.
    766 *
    767 * Because of its unprivileged constraints, Landlock relies on file hierarchies
    768 * (and not only inodes) to tie access rights to files.  Being able to link or
    769 * rename a file hierarchy brings some challenges.  Indeed, moving or linking a
    770 * file (i.e. creating a new reference to an inode) can have an impact on the
    771 * actions allowed for a set of files if it would change its parent directory
    772 * (i.e. reparenting).
    773 *
    774 * To avoid trivial access right bypasses, Landlock first checks if the file or
    775 * directory requested to be moved would gain new access rights inherited from
    776 * its new hierarchy.  Before returning any error, Landlock then checks that
    777 * the parent source hierarchy and the destination hierarchy would allow the
    778 * link or rename action.  If it is not the case, an error with EACCES is
    779 * returned to inform user space that there is no way to remove or create the
    780 * requested source file type.  If it should be allowed but the new inherited
    781 * access rights would be greater than the source access rights, then the
    782 * kernel returns an error with EXDEV.  Prioritizing EACCES over EXDEV enables
    783 * user space to abort the whole operation if there is no way to do it, or to
    784 * manually copy the source to the destination if this remains allowed, e.g.
    785 * because file creation is allowed on the destination directory but not direct
    786 * linking.
    787 *
    788 * To achieve this goal, the kernel needs to compare two file hierarchies: the
    789 * one identifying the source file or directory (including itself), and the
    790 * destination one.  This can be seen as a multilayer partial ordering problem.
    791 * The kernel walks through these paths and collects in a matrix the access
    792 * rights that are denied per layer.  These matrices are then compared to see
    793 * if the destination one has more (or the same) restrictions as the source
    794 * one.  If this is the case, the requested action will not return EXDEV, which
    795 * doesn't mean the action is allowed.  The parent hierarchy of the source
    796 * (i.e. parent directory), and the destination hierarchy must also be checked
    797 * to verify that they explicitly allow such action (i.e.  referencing,
    798 * creation and potentially removal rights).  The kernel implementation is then
    799 * required to rely on potentially four matrices of access rights: one for the
    800 * source file or directory (i.e. the child), a potentially other one for the
    801 * other source/destination (in case of RENAME_EXCHANGE), one for the source
    802 * parent hierarchy and a last one for the destination hierarchy.  These
    803 * ephemeral matrices take some space on the stack, which limits the number of
    804 * layers to a deemed reasonable number: 16.
    805 *
    806 * Returns:
    807 * - 0 if access is allowed;
    808 * - -EXDEV if @old_dentry would inherit new access rights from @new_dir;
    809 * - -EACCES if file removal or creation is denied.
    810 */
    811static int current_check_refer_path(struct dentry *const old_dentry,
    812				    const struct path *const new_dir,
    813				    struct dentry *const new_dentry,
    814				    const bool removable, const bool exchange)
    815{
    816	const struct landlock_ruleset *const dom =
    817		landlock_get_current_domain();
    818	bool allow_parent1, allow_parent2;
    819	access_mask_t access_request_parent1, access_request_parent2;
    820	struct path mnt_dir;
    821	layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS],
    822		layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS];
    823
    824	if (!dom)
    825		return 0;
    826	if (WARN_ON_ONCE(dom->num_layers < 1))
    827		return -EACCES;
    828	if (unlikely(d_is_negative(old_dentry)))
    829		return -ENOENT;
    830	if (exchange) {
    831		if (unlikely(d_is_negative(new_dentry)))
    832			return -ENOENT;
    833		access_request_parent1 =
    834			get_mode_access(d_backing_inode(new_dentry)->i_mode);
    835	} else {
    836		access_request_parent1 = 0;
    837	}
    838	access_request_parent2 =
    839		get_mode_access(d_backing_inode(old_dentry)->i_mode);
    840	if (removable) {
    841		access_request_parent1 |= maybe_remove(old_dentry);
    842		access_request_parent2 |= maybe_remove(new_dentry);
    843	}
    844
    845	/* The mount points are the same for old and new paths, cf. EXDEV. */
    846	if (old_dentry->d_parent == new_dir->dentry) {
    847		/*
    848		 * The LANDLOCK_ACCESS_FS_REFER access right is not required
    849		 * for same-directory referer (i.e. no reparenting).
    850		 */
    851		access_request_parent1 = init_layer_masks(
    852			dom, access_request_parent1 | access_request_parent2,
    853			&layer_masks_parent1);
    854		return check_access_path_dual(dom, new_dir,
    855					      access_request_parent1,
    856					      &layer_masks_parent1, NULL, 0,
    857					      NULL, NULL);
    858	}
    859
    860	/* Backward compatibility: no reparenting support. */
    861	if (!(get_handled_accesses(dom) & LANDLOCK_ACCESS_FS_REFER))
    862		return -EXDEV;
    863
    864	access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER;
    865	access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER;
    866
    867	/* Saves the common mount point. */
    868	mnt_dir.mnt = new_dir->mnt;
    869	mnt_dir.dentry = new_dir->mnt->mnt_root;
    870
    871	/* new_dir->dentry is equal to new_dentry->d_parent */
    872	allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry,
    873						old_dentry->d_parent,
    874						&layer_masks_parent1);
    875	allow_parent2 = collect_domain_accesses(
    876		dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2);
    877
    878	if (allow_parent1 && allow_parent2)
    879		return 0;
    880
    881	/*
    882	 * To be able to compare source and destination domain access rights,
    883	 * take into account the @old_dentry access rights aggregated with its
    884	 * parent access rights.  This will be useful to compare with the
    885	 * destination parent access rights.
    886	 */
    887	return check_access_path_dual(dom, &mnt_dir, access_request_parent1,
    888				      &layer_masks_parent1, old_dentry,
    889				      access_request_parent2,
    890				      &layer_masks_parent2,
    891				      exchange ? new_dentry : NULL);
    892}
    893
    894/* Inode hooks */
    895
    896static void hook_inode_free_security(struct inode *const inode)
    897{
    898	/*
    899	 * All inodes must already have been untied from their object by
    900	 * release_inode() or hook_sb_delete().
    901	 */
    902	WARN_ON_ONCE(landlock_inode(inode)->object);
    903}
    904
    905/* Super-block hooks */
    906
    907/*
    908 * Release the inodes used in a security policy.
    909 *
    910 * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
    911 */
    912static void hook_sb_delete(struct super_block *const sb)
    913{
    914	struct inode *inode, *prev_inode = NULL;
    915
    916	if (!landlock_initialized)
    917		return;
    918
    919	spin_lock(&sb->s_inode_list_lock);
    920	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
    921		struct landlock_object *object;
    922
    923		/* Only handles referenced inodes. */
    924		if (!atomic_read(&inode->i_count))
    925			continue;
    926
    927		/*
    928		 * Protects against concurrent modification of inode (e.g.
    929		 * from get_inode_object()).
    930		 */
    931		spin_lock(&inode->i_lock);
    932		/*
    933		 * Checks I_FREEING and I_WILL_FREE  to protect against a race
    934		 * condition when release_inode() just called iput(), which
    935		 * could lead to a NULL dereference of inode->security or a
    936		 * second call to iput() for the same Landlock object.  Also
     937		 * checks I_NEW because such an inode cannot be tied to an object.
    938		 */
    939		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
    940			spin_unlock(&inode->i_lock);
    941			continue;
    942		}
    943
    944		rcu_read_lock();
    945		object = rcu_dereference(landlock_inode(inode)->object);
    946		if (!object) {
    947			rcu_read_unlock();
    948			spin_unlock(&inode->i_lock);
    949			continue;
    950		}
    951		/* Keeps a reference to this inode until the next loop walk. */
    952		__iget(inode);
    953		spin_unlock(&inode->i_lock);
    954
    955		/*
    956		 * If there is no concurrent release_inode() ongoing, then we
    957		 * are in charge of calling iput() on this inode, otherwise we
    958		 * will just wait for it to finish.
    959		 */
    960		spin_lock(&object->lock);
    961		if (object->underobj == inode) {
    962			object->underobj = NULL;
    963			spin_unlock(&object->lock);
    964			rcu_read_unlock();
    965
    966			/*
    967			 * Because object->underobj was not NULL,
    968			 * release_inode() and get_inode_object() guarantee
    969			 * that it is safe to reset
    970			 * landlock_inode(inode)->object while it is not NULL.
    971			 * It is therefore not necessary to lock inode->i_lock.
    972			 */
    973			rcu_assign_pointer(landlock_inode(inode)->object, NULL);
    974			/*
    975			 * At this point, we own the ihold() reference that was
    976			 * originally set up by get_inode_object() and the
    977			 * __iget() reference that we just set in this loop
    978			 * walk.  Therefore the following call to iput() will
    979			 * not sleep nor drop the inode because there is now at
     980			 * not sleep nor drop the inode because there are now at
    981			 */
    982			iput(inode);
    983		} else {
    984			spin_unlock(&object->lock);
    985			rcu_read_unlock();
    986		}
    987
    988		if (prev_inode) {
    989			/*
    990			 * At this point, we still own the __iget() reference
    991			 * that we just set in this loop walk.  Therefore we
    992			 * can drop the list lock and know that the inode won't
    993			 * disappear from under us until the next loop walk.
    994			 */
    995			spin_unlock(&sb->s_inode_list_lock);
    996			/*
    997			 * We can now actually put the inode reference from the
    998			 * previous loop walk, which is not needed anymore.
    999			 */
   1000			iput(prev_inode);
   1001			cond_resched();
   1002			spin_lock(&sb->s_inode_list_lock);
   1003		}
   1004		prev_inode = inode;
   1005	}
   1006	spin_unlock(&sb->s_inode_list_lock);
   1007
   1008	/* Puts the inode reference from the last loop walk, if any. */
   1009	if (prev_inode)
   1010		iput(prev_inode);
   1011	/* Waits for pending iput() in release_inode(). */
   1012	wait_var_event(&landlock_superblock(sb)->inode_refs,
   1013		       !atomic_long_read(&landlock_superblock(sb)->inode_refs));
   1014}
   1015
   1016/*
   1017 * Because a Landlock security policy is defined according to the filesystem
   1018 * topology (i.e. the mount namespace), changing it may grant access to files
   1019 * not previously allowed.
   1020 *
   1021 * To make it simple, deny any filesystem topology modification by landlocked
   1022 * processes.  Non-landlocked processes may still change the namespace of a
   1023 * landlocked process, but this kind of threat must be handled by a system-wide
   1024 * access-control security policy.
   1025 *
   1026 * This could be lifted in the future if Landlock can safely handle mount
   1027 * namespace updates requested by a landlocked process.  Indeed, we could
   1028 * update the current domain (which is currently read-only) by taking into
   1029 * account the accesses of the source and the destination of a new mount point.
    1030 * However, it would also require making all the child domains dynamically
   1031 * inherit these new constraints.  Anyway, for backward compatibility reasons,
   1032 * a dedicated user space option would be required (e.g. as a ruleset flag).
   1033 */
   1034static int hook_sb_mount(const char *const dev_name,
   1035			 const struct path *const path, const char *const type,
   1036			 const unsigned long flags, void *const data)
   1037{
   1038	if (!landlock_get_current_domain())
   1039		return 0;
   1040	return -EPERM;
   1041}
   1042
   1043static int hook_move_mount(const struct path *const from_path,
   1044			   const struct path *const to_path)
   1045{
   1046	if (!landlock_get_current_domain())
   1047		return 0;
   1048	return -EPERM;
   1049}
   1050
   1051/*
    1052 * Removing a mount point may reveal a previously hidden file hierarchy, which
    1053 * may then grant access to files that may previously have been forbidden.
   1054 */
   1055static int hook_sb_umount(struct vfsmount *const mnt, const int flags)
   1056{
   1057	if (!landlock_get_current_domain())
   1058		return 0;
   1059	return -EPERM;
   1060}
   1061
   1062static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts)
   1063{
   1064	if (!landlock_get_current_domain())
   1065		return 0;
   1066	return -EPERM;
   1067}
   1068
   1069/*
   1070 * pivot_root(2), like mount(2), changes the current mount namespace.  It must
   1071 * then be forbidden for a landlocked process.
   1072 *
   1073 * However, chroot(2) may be allowed because it only changes the relative root
   1074 * directory of the current process.  Moreover, it can be used to restrict the
   1075 * view of the filesystem.
   1076 */
   1077static int hook_sb_pivotroot(const struct path *const old_path,
   1078			     const struct path *const new_path)
   1079{
   1080	if (!landlock_get_current_domain())
   1081		return 0;
   1082	return -EPERM;
   1083}
   1084
   1085/* Path hooks */
   1086
   1087static int hook_path_link(struct dentry *const old_dentry,
   1088			  const struct path *const new_dir,
   1089			  struct dentry *const new_dentry)
   1090{
   1091	return current_check_refer_path(old_dentry, new_dir, new_dentry, false,
   1092					false);
   1093}
   1094
   1095static int hook_path_rename(const struct path *const old_dir,
   1096			    struct dentry *const old_dentry,
   1097			    const struct path *const new_dir,
   1098			    struct dentry *const new_dentry,
   1099			    const unsigned int flags)
   1100{
   1101	/* old_dir refers to old_dentry->d_parent and new_dir->mnt */
   1102	return current_check_refer_path(old_dentry, new_dir, new_dentry, true,
   1103					!!(flags & RENAME_EXCHANGE));
   1104}
   1105
   1106static int hook_path_mkdir(const struct path *const dir,
   1107			   struct dentry *const dentry, const umode_t mode)
   1108{
   1109	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR);
   1110}
   1111
   1112static int hook_path_mknod(const struct path *const dir,
   1113			   struct dentry *const dentry, const umode_t mode,
   1114			   const unsigned int dev)
   1115{
   1116	const struct landlock_ruleset *const dom =
   1117		landlock_get_current_domain();
   1118
   1119	if (!dom)
   1120		return 0;
   1121	return check_access_path(dom, dir, get_mode_access(mode));
   1122}
   1123
   1124static int hook_path_symlink(const struct path *const dir,
   1125			     struct dentry *const dentry,
   1126			     const char *const old_name)
   1127{
   1128	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM);
   1129}
   1130
   1131static int hook_path_unlink(const struct path *const dir,
   1132			    struct dentry *const dentry)
   1133{
   1134	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE);
   1135}
   1136
   1137static int hook_path_rmdir(const struct path *const dir,
   1138			   struct dentry *const dentry)
   1139{
   1140	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR);
   1141}
   1142
   1143/* File hooks */
   1144
   1145static inline access_mask_t get_file_access(const struct file *const file)
   1146{
   1147	access_mask_t access = 0;
   1148
   1149	if (file->f_mode & FMODE_READ) {
   1150		/* A directory can only be opened in read mode. */
   1151		if (S_ISDIR(file_inode(file)->i_mode))
   1152			return LANDLOCK_ACCESS_FS_READ_DIR;
   1153		access = LANDLOCK_ACCESS_FS_READ_FILE;
   1154	}
   1155	if (file->f_mode & FMODE_WRITE)
   1156		access |= LANDLOCK_ACCESS_FS_WRITE_FILE;
   1157	/* __FMODE_EXEC is indeed part of f_flags, not f_mode. */
   1158	if (file->f_flags & __FMODE_EXEC)
   1159		access |= LANDLOCK_ACCESS_FS_EXECUTE;
   1160	return access;
   1161}
   1162
   1163static int hook_file_open(struct file *const file)
   1164{
   1165	const struct landlock_ruleset *const dom =
   1166		landlock_get_current_domain();
   1167
   1168	if (!dom)
   1169		return 0;
   1170	/*
   1171	 * Because a file may be opened with O_PATH, get_file_access() may
   1172	 * return 0.  This case will be handled with a future Landlock
   1173	 * evolution.
   1174	 */
   1175	return check_access_path(dom, &file->f_path, get_file_access(file));
   1176}
   1177
   1178static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
   1179	LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),
   1180
   1181	LSM_HOOK_INIT(sb_delete, hook_sb_delete),
   1182	LSM_HOOK_INIT(sb_mount, hook_sb_mount),
   1183	LSM_HOOK_INIT(move_mount, hook_move_mount),
   1184	LSM_HOOK_INIT(sb_umount, hook_sb_umount),
   1185	LSM_HOOK_INIT(sb_remount, hook_sb_remount),
   1186	LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot),
   1187
   1188	LSM_HOOK_INIT(path_link, hook_path_link),
   1189	LSM_HOOK_INIT(path_rename, hook_path_rename),
   1190	LSM_HOOK_INIT(path_mkdir, hook_path_mkdir),
   1191	LSM_HOOK_INIT(path_mknod, hook_path_mknod),
   1192	LSM_HOOK_INIT(path_symlink, hook_path_symlink),
   1193	LSM_HOOK_INIT(path_unlink, hook_path_unlink),
   1194	LSM_HOOK_INIT(path_rmdir, hook_path_rmdir),
   1195
   1196	LSM_HOOK_INIT(file_open, hook_file_open),
   1197};
   1198
   1199__init void landlock_add_fs_hooks(void)
   1200{
   1201	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
   1202			   LANDLOCK_NAME);
   1203}