cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

core.c (35123B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * KFENCE guarded object allocator and fault handling.
      4 *
      5 * Copyright (C) 2020, Google LLC.
      6 */
      7
      8#define pr_fmt(fmt) "kfence: " fmt
      9
     10#include <linux/atomic.h>
     11#include <linux/bug.h>
     12#include <linux/debugfs.h>
     13#include <linux/hash.h>
     14#include <linux/irq_work.h>
     15#include <linux/jhash.h>
     16#include <linux/kcsan-checks.h>
     17#include <linux/kfence.h>
     18#include <linux/kmemleak.h>
     19#include <linux/list.h>
     20#include <linux/lockdep.h>
     21#include <linux/log2.h>
     22#include <linux/memblock.h>
     23#include <linux/moduleparam.h>
     24#include <linux/notifier.h>
     25#include <linux/panic_notifier.h>
     26#include <linux/random.h>
     27#include <linux/rcupdate.h>
     28#include <linux/sched/clock.h>
     29#include <linux/sched/sysctl.h>
     30#include <linux/seq_file.h>
     31#include <linux/slab.h>
     32#include <linux/spinlock.h>
     33#include <linux/string.h>
     34
     35#include <asm/kfence.h>
     36
     37#include "kfence.h"
     38
     39/* Disables KFENCE on the first warning assuming an irrecoverable error. */
     40#define KFENCE_WARN_ON(cond)                                                   \
     41	({                                                                     \
     42		const bool __cond = WARN_ON(cond);                             \
     43		if (unlikely(__cond)) {                                        \
     44			WRITE_ONCE(kfence_enabled, false);                     \
     45			disabled_by_warn = true;                               \
     46		}                                                              \
     47		__cond;                                                        \
     48	})
     49
     50/* === Data ================================================================= */
     51
     52static bool kfence_enabled __read_mostly;
     53static bool disabled_by_warn __read_mostly;
     54
     55unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
     56EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
     57
     58#ifdef MODULE_PARAM_PREFIX
     59#undef MODULE_PARAM_PREFIX
     60#endif
     61#define MODULE_PARAM_PREFIX "kfence."
     62
     63static int kfence_enable_late(void);
     64static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
     65{
     66	unsigned long num;
     67	int ret = kstrtoul(val, 0, &num);
     68
     69	if (ret < 0)
     70		return ret;
     71
     72	/* Using 0 to indicate KFENCE is disabled. */
     73	if (!num && READ_ONCE(kfence_enabled)) {
     74		pr_info("disabled\n");
     75		WRITE_ONCE(kfence_enabled, false);
     76	}
     77
     78	*((unsigned long *)kp->arg) = num;
     79
     80	if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
     81		return disabled_by_warn ? -EINVAL : kfence_enable_late();
     82	return 0;
     83}
     84
     85static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
     86{
     87	if (!READ_ONCE(kfence_enabled))
     88		return sprintf(buffer, "0\n");
     89
     90	return param_get_ulong(buffer, kp);
     91}
     92
     93static const struct kernel_param_ops sample_interval_param_ops = {
     94	.set = param_set_sample_interval,
     95	.get = param_get_sample_interval,
     96};
     97module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
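/*
 * For illustration: with MODULE_PARAM_PREFIX "kfence." above, this parameter
 * is normally reachable as kfence.sample_interval on the kernel command line
 * and, given the 0600 mode, as a root-writable sysfs file, e.g.:
 *
 *	kfence.sample_interval=100                                   (boot, in ms)
 *	echo 0   > /sys/module/kfence/parameters/sample_interval     (disable)
 *	echo 100 > /sys/module/kfence/parameters/sample_interval     (re-enable via kfence_enable_late())
 *
 * Treat the exact paths as assumptions based on the standard module_param
 * sysfs layout rather than something this file itself guarantees.
 */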
     98
     99/* Pool usage % threshold above which currently covered allocations are skipped. */
    100static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
    101module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
    102
    103/* If true, use a deferrable timer. */
    104static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
    105module_param_named(deferrable, kfence_deferrable, bool, 0444);
    106
    107/* If true, check all canary bytes on panic. */
    108static bool kfence_check_on_panic __read_mostly;
    109module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444);
    110
    111/* The pool of pages used for guard pages and objects. */
    112char *__kfence_pool __read_mostly;
    113EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
    114
    115/*
    116 * Per-object metadata, with one-to-one mapping of object metadata to
    117 * backing pages (in __kfence_pool).
    118 */
    119static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
    120struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
    121
    122/* Freelist with available objects. */
    123static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
    124static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
    125
    126/*
    127 * The static key to set up a KFENCE allocation; or if static keys are not used
    128 * to gate allocations, to avoid a load and compare if KFENCE is disabled.
    129 */
    130DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
    131
    132/* Gates the allocation, ensuring only one succeeds in a given period. */
    133atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
    134
    135/*
    136 * A Counting Bloom filter of allocation coverage: limits currently covered
    137 * allocations of the same source filling up the pool.
    138 *
    139 * Assuming a range of 15%-85% unique allocations in the pool at any point in
    140 * time, the below parameters provide a probability of 0.02-0.33 for false
    141 * positive hits respectively:
    142 *
    143 *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
    144 */
    145#define ALLOC_COVERED_HNUM	2
    146#define ALLOC_COVERED_ORDER	(const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
    147#define ALLOC_COVERED_SIZE	(1 << ALLOC_COVERED_ORDER)
    148#define ALLOC_COVERED_HNEXT(h)	hash_32(h, ALLOC_COVERED_ORDER)
    149#define ALLOC_COVERED_MASK	(ALLOC_COVERED_SIZE - 1)
    150static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
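/*
 * Worked example (assuming the default CONFIG_KFENCE_NUM_OBJECTS=255 and the
 * formula above): ALLOC_COVERED_ORDER = const_ilog2(255) + 2 = 9, so
 * ALLOC_COVERED_SIZE = 512 counters. With 15% unique allocations (~38 traces),
 * (1 - e^(-2*38/512))^2 ~= 0.02; with 85% (~217 traces),
 * (1 - e^(-2*217/512))^2 ~= 0.33 -- matching the 0.02-0.33 range quoted above.
 */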
    151
    152/* Stack depth used to determine uniqueness of an allocation. */
    153#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
    154
    155/*
    156 * Randomness for stack hashes, making the same collisions across reboots and
    157 * different machines less likely.
    158 */
    159static u32 stack_hash_seed __ro_after_init;
    160
    161/* Statistics counters for debugfs. */
    162enum kfence_counter_id {
    163	KFENCE_COUNTER_ALLOCATED,
    164	KFENCE_COUNTER_ALLOCS,
    165	KFENCE_COUNTER_FREES,
    166	KFENCE_COUNTER_ZOMBIES,
    167	KFENCE_COUNTER_BUGS,
    168	KFENCE_COUNTER_SKIP_INCOMPAT,
    169	KFENCE_COUNTER_SKIP_CAPACITY,
    170	KFENCE_COUNTER_SKIP_COVERED,
    171	KFENCE_COUNTER_COUNT,
    172};
    173static atomic_long_t counters[KFENCE_COUNTER_COUNT];
    174static const char *const counter_names[] = {
    175	[KFENCE_COUNTER_ALLOCATED]	= "currently allocated",
    176	[KFENCE_COUNTER_ALLOCS]		= "total allocations",
    177	[KFENCE_COUNTER_FREES]		= "total frees",
    178	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
    179	[KFENCE_COUNTER_BUGS]		= "total bugs",
    180	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
    181	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
    182	[KFENCE_COUNTER_SKIP_COVERED]	= "skipped allocations (covered)",
    183};
    184static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
    185
    186/* === Internals ============================================================ */
    187
    188static inline bool should_skip_covered(void)
    189{
    190	unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
    191
    192	return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
    193}
    194
    195static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
    196{
    197	num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
    198	num_entries = filter_irq_stacks(stack_entries, num_entries);
    199	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
    200}
    201
    202/*
    203 * Adds (or subtracts) count @val for allocation stack trace hash
    204 * @alloc_stack_hash to (from) the Counting Bloom filter.
    205 */
    206static void alloc_covered_add(u32 alloc_stack_hash, int val)
    207{
    208	int i;
    209
    210	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
    211		atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
    212		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
    213	}
    214}
    215
    216/*
    217 * Returns true if the allocation stack trace hash @alloc_stack_hash is
    218 * currently contained (non-zero count) in Counting Bloom filter.
    219 */
    220static bool alloc_covered_contains(u32 alloc_stack_hash)
    221{
    222	int i;
    223
    224	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
    225		if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
    226			return false;
    227		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
    228	}
    229
    230	return true;
    231}
    232
    233static bool kfence_protect(unsigned long addr)
    234{
    235	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
    236}
    237
    238static bool kfence_unprotect(unsigned long addr)
    239{
    240	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
    241}
    242
    243static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
    244{
    245	unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
    246	unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];
    247
    248	/* The checks do not affect performance; only called from slow-paths. */
    249
    250	/* Only call with a pointer into kfence_metadata. */
    251	if (KFENCE_WARN_ON(meta < kfence_metadata ||
    252			   meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
    253		return 0;
    254
    255	/*
    256	 * This metadata object only ever maps to 1 page; verify that the stored
    257	 * address is in the expected range.
    258	 */
    259	if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
    260		return 0;
    261
    262	return pageaddr;
    263}
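/*
 * Example of the address arithmetic above, assuming 4 KiB pages: the object
 * page for kfence_metadata[0] sits at __kfence_pool + 2*PAGE_SIZE = +0x2000
 * (pool pages 0 and 1 are the leading guard pages set up in
 * kfence_init_pool()), kfence_metadata[1] at +0x4000, and in general object i
 * occupies pool page 2*(i+1), with pool page 2*(i+1)+1 as its right redzone.
 */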
    264
    265/*
    266 * Update the object's metadata state, including updating the alloc/free stacks
    267 * depending on the state transition.
    268 */
    269static noinline void
    270metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
    271		      unsigned long *stack_entries, size_t num_stack_entries)
    272{
    273	struct kfence_track *track =
    274		next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
    275
    276	lockdep_assert_held(&meta->lock);
    277
    278	if (stack_entries) {
    279		memcpy(track->stack_entries, stack_entries,
    280		       num_stack_entries * sizeof(stack_entries[0]));
    281	} else {
    282		/*
    283		 * Skip over 1 (this) function; noinline ensures we do not
    284		 * accidentally skip over the caller by never inlining.
    285		 */
    286		num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
    287	}
    288	track->num_stack_entries = num_stack_entries;
    289	track->pid = task_pid_nr(current);
    290	track->cpu = raw_smp_processor_id();
    291	track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
    292
    293	/*
    294	 * Pairs with READ_ONCE() in
    295	 *	kfence_shutdown_cache(),
    296	 *	kfence_handle_page_fault().
    297	 */
    298	WRITE_ONCE(meta->state, next);
    299}
    300
    301/* Write canary byte to @addr. */
    302static inline bool set_canary_byte(u8 *addr)
    303{
    304	*addr = KFENCE_CANARY_PATTERN(addr);
    305	return true;
    306}
    307
    308/* Check canary byte at @addr. */
    309static inline bool check_canary_byte(u8 *addr)
    310{
    311	struct kfence_metadata *meta;
    312	unsigned long flags;
    313
    314	if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
    315		return true;
    316
    317	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
    318
    319	meta = addr_to_metadata((unsigned long)addr);
    320	raw_spin_lock_irqsave(&meta->lock, flags);
    321	kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
    322	raw_spin_unlock_irqrestore(&meta->lock, flags);
    323
    324	return false;
    325}
    326
    327/* __always_inline this to ensure we won't do an indirect call to fn. */
    328static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
    329{
    330	const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
    331	unsigned long addr;
    332
    333	/*
    334	 * We'll iterate over each canary byte per-side until fn() returns
    335	 * false. However, we'll still iterate over the canary bytes to the
    336	 * right of the object even if there was an error in the canary bytes to
    337	 * the left of the object. Specifically, if check_canary_byte()
    338	 * generates an error, showing both sides might give more clues as to
    339	 * what the error is about when displaying which bytes were corrupted.
    340	 */
    341
    342	/* Apply to left of object. */
    343	for (addr = pageaddr; addr < meta->addr; addr++) {
    344		if (!fn((u8 *)addr))
    345			break;
    346	}
    347
    348	/* Apply to right of object. */
    349	for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
    350		if (!fn((u8 *)addr))
    351			break;
    352	}
    353}
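/*
 * Layout of one guarded page as covered by the two loops above (the object is
 * placed at meta->addr, which may be left- or right-aligned within the page):
 *
 *	pageaddr              meta->addr    meta->addr+meta->size    pageaddr+PAGE_SIZE
 *	| canary bytes ...... | object data | canary bytes .......... |
 *
 * Every byte of the page outside [meta->addr, meta->addr + meta->size) is a
 * canary, checked on free (and on panic if check_on_panic is set).
 */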
    354
    355static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
    356				  unsigned long *stack_entries, size_t num_stack_entries,
    357				  u32 alloc_stack_hash)
    358{
    359	struct kfence_metadata *meta = NULL;
    360	unsigned long flags;
    361	struct slab *slab;
    362	void *addr;
    363	const bool random_right_allocate = prandom_u32_max(2);
    364	const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
    365				  !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS);
    366
    367	/* Try to obtain a free object. */
    368	raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
    369	if (!list_empty(&kfence_freelist)) {
    370		meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
    371		list_del_init(&meta->list);
    372	}
    373	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
    374	if (!meta) {
    375		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
    376		return NULL;
    377	}
    378
    379	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
    380		/*
    381		 * This is extremely unlikely -- we are reporting on a
    382		 * use-after-free, which locked meta->lock, and the reporting
    383		 * code via printk calls kmalloc() which ends up in
    384		 * kfence_alloc() and tries to grab the same object that we're
    385		 * reporting on. While it has never been observed, lockdep does
    386		 * report that there is a possibility of deadlock. Fix it by
    387		 * using trylock and bailing out gracefully.
    388		 */
    389		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
    390		/* Put the object back on the freelist. */
    391		list_add_tail(&meta->list, &kfence_freelist);
    392		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
    393
    394		return NULL;
    395	}
    396
    397	meta->addr = metadata_to_pageaddr(meta);
    398	/* Unprotect if we're reusing this page. */
    399	if (meta->state == KFENCE_OBJECT_FREED)
    400		kfence_unprotect(meta->addr);
    401
    402	/*
    403	 * Note: for allocations made before RNG initialization, prandom_u32_max()
    404	 * always returns zero. We still benefit from enabling KFENCE as early as
    405	 * possible, even when the RNG is not yet available, as this will allow
    406	 * KFENCE to detect bugs due to earlier allocations. The only downside
    407	 * is that the out-of-bounds accesses detected are deterministic for
    408	 * such allocations.
    409	 */
    410	if (random_right_allocate) {
    411		/* Allocate on the "right" side, re-calculate address. */
    412		meta->addr += PAGE_SIZE - size;
    413		meta->addr = ALIGN_DOWN(meta->addr, cache->align);
    414	}
    415
    416	addr = (void *)meta->addr;
    417
    418	/* Update remaining metadata. */
    419	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
    420	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
    421	WRITE_ONCE(meta->cache, cache);
    422	meta->size = size;
    423	meta->alloc_stack_hash = alloc_stack_hash;
    424	raw_spin_unlock_irqrestore(&meta->lock, flags);
    425
    426	alloc_covered_add(alloc_stack_hash, 1);
    427
    428	/* Set required slab fields. */
    429	slab = virt_to_slab((void *)meta->addr);
    430	slab->slab_cache = cache;
    431#if defined(CONFIG_SLUB)
    432	slab->objects = 1;
    433#elif defined(CONFIG_SLAB)
    434	slab->s_mem = addr;
    435#endif
    436
    437	/* Memory initialization. */
    438	for_each_canary(meta, set_canary_byte);
    439
    440	/*
    441	 * We check slab_want_init_on_alloc() ourselves, rather than letting
    442	 * SL*B do the initialization, as otherwise we might overwrite KFENCE's
    443	 * redzone.
    444	 */
    445	if (unlikely(slab_want_init_on_alloc(gfp, cache)))
    446		memzero_explicit(addr, size);
    447	if (cache->ctor)
    448		cache->ctor(addr);
    449
    450	if (random_fault)
    451		kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
    452
    453	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
    454	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
    455
    456	return addr;
    457}
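/*
 * Placement example with hypothetical numbers (PAGE_SIZE = 4096, size = 100,
 * cache->align = 8): a "left" allocation returns meta->addr == pageaddr, so an
 * out-of-bounds access one byte below the object faults on the left guard page
 * immediately. A "right" allocation computes pageaddr + 4096 - 100 =
 * pageaddr + 3996, aligned down to pageaddr + 3992; the few alignment-slack
 * bytes between the object end and the page end are canary bytes, so an OOB
 * write there is caught at free time rather than via a page fault.
 */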
    458
    459static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
    460{
    461	struct kcsan_scoped_access assert_page_exclusive;
    462	unsigned long flags;
    463	bool init;
    464
    465	raw_spin_lock_irqsave(&meta->lock, flags);
    466
    467	if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
    468		/* Invalid or double-free, bail out. */
    469		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
    470		kfence_report_error((unsigned long)addr, false, NULL, meta,
    471				    KFENCE_ERROR_INVALID_FREE);
    472		raw_spin_unlock_irqrestore(&meta->lock, flags);
    473		return;
    474	}
    475
    476	/* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
    477	kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
    478				  KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
    479				  &assert_page_exclusive);
    480
    481	if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
    482		kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
    483
    484	/* Restore page protection if there was an OOB access. */
    485	if (meta->unprotected_page) {
    486		memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
    487		kfence_protect(meta->unprotected_page);
    488		meta->unprotected_page = 0;
    489	}
    490
    491	/* Mark the object as freed. */
    492	metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
    493	init = slab_want_init_on_free(meta->cache);
    494	raw_spin_unlock_irqrestore(&meta->lock, flags);
    495
    496	alloc_covered_add(meta->alloc_stack_hash, -1);
    497
    498	/* Check canary bytes for memory corruption. */
    499	for_each_canary(meta, check_canary_byte);
    500
    501	/*
    502	 * Clear memory if init-on-free is set. While we protect the page, the
    503	 * data is still there, and after a use-after-free is detected, we
    504	 * unprotect the page, so the data is still accessible.
    505	 */
    506	if (!zombie && unlikely(init))
    507		memzero_explicit(addr, meta->size);
    508
    509	/* Protect to detect use-after-frees. */
    510	kfence_protect((unsigned long)addr);
    511
    512	kcsan_end_scoped_access(&assert_page_exclusive);
    513	if (!zombie) {
    514		/* Add it to the tail of the freelist for reuse. */
    515		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
    516		KFENCE_WARN_ON(!list_empty(&meta->list));
    517		list_add_tail(&meta->list, &kfence_freelist);
    518		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
    519
    520		atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
    521		atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
    522	} else {
    523		/* See kfence_shutdown_cache(). */
    524		atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
    525	}
    526}
    527
    528static void rcu_guarded_free(struct rcu_head *h)
    529{
    530	struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
    531
    532	kfence_guarded_free((void *)meta->addr, meta, false);
    533}
    534
    535/*
    536 * Initialization of the KFENCE pool after its allocation.
    537 * Returns 0 on success; otherwise returns the address up to
    538 * which partial initialization succeeded.
    539 */
    540static unsigned long kfence_init_pool(void)
    541{
    542	unsigned long addr = (unsigned long)__kfence_pool;
    543	struct page *pages;
    544	int i;
    545
    546	if (!arch_kfence_init_pool())
    547		return addr;
    548
    549	pages = virt_to_page(addr);
    550
    551	/*
    552	 * Set up object pages: they must have PG_slab set, to avoid freeing
    553	 * these as real pages.
    554	 *
    555	 * We also want to avoid inserting kfence_free() in the kfree()
    556	 * fast-path in SLUB, and therefore need to ensure kfree() correctly
    557	 * enters __slab_free() slow-path.
    558	 */
    559	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
    560		struct slab *slab = page_slab(&pages[i]);
    561
    562		if (!i || (i % 2))
    563			continue;
    564
    565		/* Verify we do not have a compound head page. */
    566		if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
    567			return addr;
    568
    569		__folio_set_slab(slab_folio(slab));
    570#ifdef CONFIG_MEMCG
    571		slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
    572				   MEMCG_DATA_OBJCGS;
    573#endif
    574	}
    575
    576	/*
    577	 * Protect the first 2 pages. The first page is mostly unnecessary, and
    578	 * merely serves as an extended guard page. However, adding one
    579	 * additional page in the beginning gives us an even number of pages,
    580	 * which simplifies the mapping of address to metadata index.
    581	 */
    582	for (i = 0; i < 2; i++) {
    583		if (unlikely(!kfence_protect(addr)))
    584			return addr;
    585
    586		addr += PAGE_SIZE;
    587	}
    588
    589	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
    590		struct kfence_metadata *meta = &kfence_metadata[i];
    591
    592		/* Initialize metadata. */
    593		INIT_LIST_HEAD(&meta->list);
    594		raw_spin_lock_init(&meta->lock);
    595		meta->state = KFENCE_OBJECT_UNUSED;
    596		meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
    597		list_add_tail(&meta->list, &kfence_freelist);
    598
    599		/* Protect the right redzone. */
    600		if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
    601			return addr;
    602
    603		addr += 2 * PAGE_SIZE;
    604	}
    605
    606	/*
    607	 * The pool is live and will never be deallocated from this point on.
    608	 * Remove the pool object from the kmemleak object tree, as it would
    609	 * otherwise overlap with allocations returned by kfence_alloc(), which
    610	 * are registered with kmemleak through the slab post-alloc hook.
    611	 */
    612	kmemleak_free(__kfence_pool);
    613
    614	return 0;
    615}
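/*
 * Resulting pool layout (derived from the loops above): 2 leading guard pages,
 * then, per object, one data page followed by one guard/redzone page:
 *
 *	[ guard | guard | obj 0 | guard | obj 1 | guard | ... | obj N-1 | guard ]
 *
 * i.e. 2 + 2 * CONFIG_KFENCE_NUM_OBJECTS pages in total, which is what
 * KFENCE_POOL_SIZE is expected to cover. With the default 255 objects and
 * 4 KiB pages that comes to 512 pages (2 MiB).
 */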
    616
    617static bool __init kfence_init_pool_early(void)
    618{
    619	unsigned long addr;
    620
    621	if (!__kfence_pool)
    622		return false;
    623
    624	addr = kfence_init_pool();
    625
    626	if (!addr)
    627		return true;
    628
    629	/*
    630	 * Only release unprotected pages, and do not try to go back and change
    631	 * page attributes due to risk of failing to do so as well. If changing
    632	 * page attributes for some pages fails, it is very likely that it also
    633	 * fails for the first page, and therefore expect addr==__kfence_pool in
    634	 * most failure cases.
    635	 */
    636	for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
    637		struct slab *slab = virt_to_slab(p);
    638
    639		if (!slab)
    640			continue;
    641#ifdef CONFIG_MEMCG
    642		slab->memcg_data = 0;
    643#endif
    644		__folio_clear_slab(slab_folio(slab));
    645	}
    646	memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
    647	__kfence_pool = NULL;
    648	return false;
    649}
    650
    651static bool kfence_init_pool_late(void)
    652{
    653	unsigned long addr, free_size;
    654
    655	addr = kfence_init_pool();
    656
    657	if (!addr)
    658		return true;
    659
    660	/* Same as above. */
    661	free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
    662#ifdef CONFIG_CONTIG_ALLOC
    663	free_contig_range(page_to_pfn(virt_to_page(addr)), free_size / PAGE_SIZE);
    664#else
    665	free_pages_exact((void *)addr, free_size);
    666#endif
    667	__kfence_pool = NULL;
    668	return false;
    669}
    670
    671/* === DebugFS Interface ==================================================== */
    672
    673static int stats_show(struct seq_file *seq, void *v)
    674{
    675	int i;
    676
    677	seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
    678	for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
    679		seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));
    680
    681	return 0;
    682}
    683DEFINE_SHOW_ATTRIBUTE(stats);
    684
    685/*
    686 * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
    687 * start_object() and next_object() return the object index + 1, because NULL is used
    688 * to stop iteration.
    689 */
    690static void *start_object(struct seq_file *seq, loff_t *pos)
    691{
    692	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
    693		return (void *)((long)*pos + 1);
    694	return NULL;
    695}
    696
    697static void stop_object(struct seq_file *seq, void *v)
    698{
    699}
    700
    701static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
    702{
    703	++*pos;
    704	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
    705		return (void *)((long)*pos + 1);
    706	return NULL;
    707}
    708
    709static int show_object(struct seq_file *seq, void *v)
    710{
    711	struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
    712	unsigned long flags;
    713
    714	raw_spin_lock_irqsave(&meta->lock, flags);
    715	kfence_print_object(seq, meta);
    716	raw_spin_unlock_irqrestore(&meta->lock, flags);
    717	seq_puts(seq, "---------------------------------\n");
    718
    719	return 0;
    720}
    721
    722static const struct seq_operations object_seqops = {
    723	.start = start_object,
    724	.next = next_object,
    725	.stop = stop_object,
    726	.show = show_object,
    727};
    728
    729static int open_objects(struct inode *inode, struct file *file)
    730{
    731	return seq_open(file, &object_seqops);
    732}
    733
    734static const struct file_operations objects_fops = {
    735	.open = open_objects,
    736	.read = seq_read,
    737	.llseek = seq_lseek,
    738	.release = seq_release,
    739};
    740
    741static int __init kfence_debugfs_init(void)
    742{
    743	struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
    744
    745	debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
    746	debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
    747	return 0;
    748}
    749
    750late_initcall(kfence_debugfs_init);
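/*
 * Example of what the files created above look like when read (paths assume
 * debugfs is mounted at /sys/kernel/debug):
 *
 *	# cat /sys/kernel/debug/kfence/stats
 *	enabled: 1
 *	currently allocated: 3
 *	total allocations: 1207
 *	...	(one "<name>: <count>" line per counter_names[] entry)
 *
 *	# cat /sys/kernel/debug/kfence/objects
 *	(one kfence_print_object() record per object, separated by dashed lines)
 *
 * The counter values shown here are made up; only the format follows
 * stats_show() and show_object() above.
 */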
    751
    752/* === Panic Notifier ====================================================== */
    753
    754static void kfence_check_all_canary(void)
    755{
    756	int i;
    757
    758	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
    759		struct kfence_metadata *meta = &kfence_metadata[i];
    760
    761		if (meta->state == KFENCE_OBJECT_ALLOCATED)
    762			for_each_canary(meta, check_canary_byte);
    763	}
    764}
    765
    766static int kfence_check_canary_callback(struct notifier_block *nb,
    767					unsigned long reason, void *arg)
    768{
    769	kfence_check_all_canary();
    770	return NOTIFY_OK;
    771}
    772
    773static struct notifier_block kfence_check_canary_notifier = {
    774	.notifier_call = kfence_check_canary_callback,
    775};
    776
    777/* === Allocation Gate Timer ================================================ */
    778
    779static struct delayed_work kfence_timer;
    780
    781#ifdef CONFIG_KFENCE_STATIC_KEYS
    782/* Wait queue to wake up allocation-gate timer task. */
    783static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
    784
    785static void wake_up_kfence_timer(struct irq_work *work)
    786{
    787	wake_up(&allocation_wait);
    788}
    789static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
    790#endif
    791
    792/*
    793 * Set up delayed work, which will enable and disable the static key. We need to
    794 * use a work queue (rather than a simple timer), since enabling and disabling a
    795 * static key cannot be done from an interrupt.
    796 *
    797 * Note: Toggling a static branch currently causes IPIs, and here we'll end up
    798 * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
    799 * more aggressive sampling intervals), we could get away with a variant that
    800 * avoids IPIs, at the cost of not immediately capturing allocations if the
    801 * instructions remain cached.
    802 */
    803static void toggle_allocation_gate(struct work_struct *work)
    804{
    805	if (!READ_ONCE(kfence_enabled))
    806		return;
    807
    808	atomic_set(&kfence_allocation_gate, 0);
    809#ifdef CONFIG_KFENCE_STATIC_KEYS
    810	/* Enable static key, and await allocation to happen. */
    811	static_branch_enable(&kfence_allocation_key);
    812
    813	if (sysctl_hung_task_timeout_secs) {
    814		/*
    815		 * During low activity with no allocations we might wait a
    816		 * while; let's avoid the hung task warning.
    817		 */
    818		wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
    819					sysctl_hung_task_timeout_secs * HZ / 2);
    820	} else {
    821		wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
    822	}
    823
    824	/* Disable static key and reset timer. */
    825	static_branch_disable(&kfence_allocation_key);
    826#endif
    827	queue_delayed_work(system_unbound_wq, &kfence_timer,
    828			   msecs_to_jiffies(kfence_sample_interval));
    829}
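/*
 * Putting the pieces together, one sample period looks roughly like this: the
 * delayed work above resets kfence_allocation_gate to 0 and enables the static
 * key; the first allocation to come through __kfence_alloc() moves the gate
 * from 0 to 1, proceeds, and (via wake_up_kfence_timer_work) wakes this
 * worker, which disables the static key again and re-queues itself after
 * kfence_sample_interval ms. So at most one guarded allocation is set up per
 * sample interval.
 */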
    830
    831/* === Public interface ===================================================== */
    832
    833void __init kfence_alloc_pool(void)
    834{
    835	if (!kfence_sample_interval)
    836		return;
    837
    838	__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
    839
    840	if (!__kfence_pool)
    841		pr_err("failed to allocate pool\n");
    842}
    843
    844static void kfence_init_enable(void)
    845{
    846	if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
    847		static_branch_enable(&kfence_allocation_key);
    848
    849	if (kfence_deferrable)
    850		INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate);
    851	else
    852		INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);
    853
    854	if (kfence_check_on_panic)
    855		atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier);
    856
    857	WRITE_ONCE(kfence_enabled, true);
    858	queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
    859
    860	pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
    861		CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
    862		(void *)(__kfence_pool + KFENCE_POOL_SIZE));
    863}
    864
    865void __init kfence_init(void)
    866{
    867	stack_hash_seed = (u32)random_get_entropy();
    868
    869	/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
    870	if (!kfence_sample_interval)
    871		return;
    872
    873	if (!kfence_init_pool_early()) {
    874		pr_err("%s failed\n", __func__);
    875		return;
    876	}
    877
    878	kfence_init_enable();
    879}
    880
    881static int kfence_init_late(void)
    882{
    883	const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
    884#ifdef CONFIG_CONTIG_ALLOC
    885	struct page *pages;
    886
    887	pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL);
    888	if (!pages)
    889		return -ENOMEM;
    890	__kfence_pool = page_to_virt(pages);
    891#else
    892	if (nr_pages > MAX_ORDER_NR_PAGES) {
    893		pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
    894		return -EINVAL;
    895	}
    896	__kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
    897	if (!__kfence_pool)
    898		return -ENOMEM;
    899#endif
    900
    901	if (!kfence_init_pool_late()) {
    902		pr_err("%s failed\n", __func__);
    903		return -EBUSY;
    904	}
    905
    906	kfence_init_enable();
    907	return 0;
    908}
    909
    910static int kfence_enable_late(void)
    911{
    912	if (!__kfence_pool)
    913		return kfence_init_late();
    914
    915	WRITE_ONCE(kfence_enabled, true);
    916	queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
    917	pr_info("re-enabled\n");
    918	return 0;
    919}
    920
    921void kfence_shutdown_cache(struct kmem_cache *s)
    922{
    923	unsigned long flags;
    924	struct kfence_metadata *meta;
    925	int i;
    926
    927	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
    928		bool in_use;
    929
    930		meta = &kfence_metadata[i];
    931
    932		/*
    933		 * If we observe some inconsistent cache and state pair where we
    934		 * should have returned false here, cache destruction is racing
    935		 * with either kmem_cache_alloc() or kmem_cache_free(). Taking
    936		 * the lock will not help, as different critical section
    937		 * serialization will have the same outcome.
    938		 */
    939		if (READ_ONCE(meta->cache) != s ||
    940		    READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
    941			continue;
    942
    943		raw_spin_lock_irqsave(&meta->lock, flags);
    944		in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
    945		raw_spin_unlock_irqrestore(&meta->lock, flags);
    946
    947		if (in_use) {
    948			/*
    949			 * This cache still has allocations, and we should not
    950			 * release them back into the freelist so they can still
    951			 * safely be used and retain the kernel's default
    952			 * behaviour of keeping the allocations alive (leak the
    953			 * cache); however, they effectively become "zombie
    954			 * allocations" as the KFENCE objects are the only ones
    955			 * still in use and the owning cache is being destroyed.
    956			 *
    957			 * We mark them freed, so that any subsequent use shows
    958			 * more useful error messages that will include stack
    959			 * traces of the user of the object, the original
    960			 * allocation, and caller to shutdown_cache().
    961			 */
    962			kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
    963		}
    964	}
    965
    966	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
    967		meta = &kfence_metadata[i];
    968
    969		/* See above. */
    970		if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
    971			continue;
    972
    973		raw_spin_lock_irqsave(&meta->lock, flags);
    974		if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
    975			meta->cache = NULL;
    976		raw_spin_unlock_irqrestore(&meta->lock, flags);
    977	}
    978}
    979
    980void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
    981{
    982	unsigned long stack_entries[KFENCE_STACK_DEPTH];
    983	size_t num_stack_entries;
    984	u32 alloc_stack_hash;
    985
    986	/*
    987	 * Perform size check before switching kfence_allocation_gate, so that
    988	 * we don't disable KFENCE without making an allocation.
    989	 */
    990	if (size > PAGE_SIZE) {
    991		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
    992		return NULL;
    993	}
    994
    995	/*
    996	 * Skip allocations from non-default zones, including DMA. We cannot
    997	 * guarantee that pages in the KFENCE pool will have the requested
    998	 * properties (e.g. reside in DMAable memory).
    999	 */
   1000	if ((flags & GFP_ZONEMASK) ||
   1001	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
   1002		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
   1003		return NULL;
   1004	}
   1005
   1006	if (atomic_inc_return(&kfence_allocation_gate) > 1)
   1007		return NULL;
   1008#ifdef CONFIG_KFENCE_STATIC_KEYS
   1009	/*
   1010	 * waitqueue_active() is fully ordered after the update of
   1011	 * kfence_allocation_gate per atomic_inc_return().
   1012	 */
   1013	if (waitqueue_active(&allocation_wait)) {
   1014		/*
   1015		 * Calling wake_up() here may deadlock when allocations happen
   1016		 * from within timer code. Use an irq_work to defer it.
   1017		 */
   1018		irq_work_queue(&wake_up_kfence_timer_work);
   1019	}
   1020#endif
   1021
   1022	if (!READ_ONCE(kfence_enabled))
   1023		return NULL;
   1024
   1025	num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
   1026
   1027	/*
   1028	 * Do expensive check for coverage of allocation in slow-path after
   1029	 * allocation_gate has already become non-zero, even though it might
   1030	 * mean not making any allocation within a given sample interval.
   1031	 *
   1032	 * This ensures reasonable allocation coverage when the pool is almost
   1033	 * full, including avoiding long-lived allocations of the same source
   1034	 * filling up the pool (e.g. pagecache allocations).
   1035	 */
   1036	alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
   1037	if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
   1038		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
   1039		return NULL;
   1040	}
   1041
   1042	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
   1043				    alloc_stack_hash);
   1044}
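/*
 * For context, the fast path that decides whether to call __kfence_alloc() at
 * all lives in include/linux/kfence.h rather than in this file. A hedged
 * sketch of the idea (names and config-dependent details may differ from the
 * real header):
 *
 *	static __always_inline void *kfence_alloc(struct kmem_cache *s,
 *						  size_t size, gfp_t flags)
 *	{
 *		if (!static_branch_unlikely(&kfence_allocation_key))
 *			return NULL;			// sampling gate closed
 *		return __kfence_alloc(s, size, flags);	// slow path above
 *	}
 *
 * i.e. the common case is a single static-branch test, and only the rare
 * sampled allocation pays for the stack trace and freelist work above.
 */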
   1045
   1046size_t kfence_ksize(const void *addr)
   1047{
   1048	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
   1049
   1050	/*
   1051	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
   1052	 * either a use-after-free or invalid access.
   1053	 */
   1054	return meta ? meta->size : 0;
   1055}
   1056
   1057void *kfence_object_start(const void *addr)
   1058{
   1059	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
   1060
   1061	/*
   1062	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
   1063	 * either a use-after-free or invalid access.
   1064	 */
   1065	return meta ? (void *)meta->addr : NULL;
   1066}
   1067
   1068void __kfence_free(void *addr)
   1069{
   1070	struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
   1071
   1072#ifdef CONFIG_MEMCG
   1073	KFENCE_WARN_ON(meta->objcg);
   1074#endif
   1075	/*
   1076	 * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
   1077	 * the object, as the object page may be recycled for other-typed
   1078	 * objects once it has been freed. meta->cache may be NULL if the cache
   1079	 * was destroyed.
   1080	 */
   1081	if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
   1082		call_rcu(&meta->rcu_head, rcu_guarded_free);
   1083	else
   1084		kfence_guarded_free(addr, meta, false);
   1085}
   1086
   1087bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
   1088{
   1089	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
   1090	struct kfence_metadata *to_report = NULL;
   1091	enum kfence_error_type error_type;
   1092	unsigned long flags;
   1093
   1094	if (!is_kfence_address((void *)addr))
   1095		return false;
   1096
   1097	if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
   1098		return kfence_unprotect(addr); /* ... unprotect and proceed. */
   1099
   1100	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
   1101
   1102	if (page_index % 2) {
   1103		/* This is a redzone, report a buffer overflow. */
   1104		struct kfence_metadata *meta;
   1105		int distance = 0;
   1106
   1107		meta = addr_to_metadata(addr - PAGE_SIZE);
   1108		if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
   1109			to_report = meta;
   1110			/* Data race ok; distance calculation approximate. */
   1111			distance = addr - data_race(meta->addr + meta->size);
   1112		}
   1113
   1114		meta = addr_to_metadata(addr + PAGE_SIZE);
   1115		if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
   1116			/* Data race ok; distance calculation approximate. */
   1117			if (!to_report || distance > data_race(meta->addr) - addr)
   1118				to_report = meta;
   1119		}
   1120
   1121		if (!to_report)
   1122			goto out;
   1123
   1124		raw_spin_lock_irqsave(&to_report->lock, flags);
   1125		to_report->unprotected_page = addr;
   1126		error_type = KFENCE_ERROR_OOB;
   1127
   1128		/*
   1129		 * If the object was freed before we took the lock we can still
   1130		 * report this as an OOB -- the report will simply show the
   1131		 * stacktrace of the free as well.
   1132		 */
   1133	} else {
   1134		to_report = addr_to_metadata(addr);
   1135		if (!to_report)
   1136			goto out;
   1137
   1138		raw_spin_lock_irqsave(&to_report->lock, flags);
   1139		error_type = KFENCE_ERROR_UAF;
   1140		/*
   1141		 * We may race with __kfence_alloc(), and it is possible that a
   1142		 * freed object may be reallocated. We simply report this as a
   1143		 * use-after-free, with the stack trace showing the place where
   1144		 * the object was re-allocated.
   1145		 */
   1146	}
   1147
   1148out:
   1149	if (to_report) {
   1150		kfence_report_error(addr, is_write, regs, to_report, error_type);
   1151		raw_spin_unlock_irqrestore(&to_report->lock, flags);
   1152	} else {
   1153		/* This may be a UAF or OOB access, but we can't be sure. */
   1154		kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
   1155	}
   1156
   1157	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
   1158}
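/*
 * Example of the classification above, assuming 4 KiB pages: a faulting
 * address a few bytes past the end of an object that occupies pool page 4
 * lands in pool page 5, so page_index is odd and the fault is reported as an
 * out-of-bounds access against the closer of the two neighbouring objects
 * (pages 4 and 6). A fault on page 4 itself (even index) means the object
 * page was touched after kfence_guarded_free() re-protected it, and is
 * reported as a use-after-free.
 */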