slub.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
slub.c (159253B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * SLUB: A slab allocator that limits cache line use instead of queuing
      4 * objects in per cpu and per node lists.
      5 *
      6 * The allocator synchronizes using per slab locks or atomic operations
      7 * and only uses a centralized lock to manage a pool of partial slabs.
      8 *
      9 * (C) 2007 SGI, Christoph Lameter
     10 * (C) 2011 Linux Foundation, Christoph Lameter
     11 */
     12
     13#include <linux/mm.h>
     14#include <linux/swap.h> /* struct reclaim_state */
     15#include <linux/module.h>
     16#include <linux/bit_spinlock.h>
     17#include <linux/interrupt.h>
     18#include <linux/swab.h>
     19#include <linux/bitops.h>
     20#include <linux/slab.h>
     21#include "slab.h"
     22#include <linux/proc_fs.h>
     23#include <linux/seq_file.h>
     24#include <linux/kasan.h>
     25#include <linux/cpu.h>
     26#include <linux/cpuset.h>
     27#include <linux/mempolicy.h>
     28#include <linux/ctype.h>
     29#include <linux/stackdepot.h>
     30#include <linux/debugobjects.h>
     31#include <linux/kallsyms.h>
     32#include <linux/kfence.h>
     33#include <linux/memory.h>
     34#include <linux/math64.h>
     35#include <linux/fault-inject.h>
     36#include <linux/stacktrace.h>
     37#include <linux/prefetch.h>
     38#include <linux/memcontrol.h>
     39#include <linux/random.h>
     40#include <kunit/test.h>
     41#include <linux/sort.h>
     42
     43#include <linux/debugfs.h>
     44#include <trace/events/kmem.h>
     45
     46#include "internal.h"
     47
     48/*
     49 * Lock order:
     50 *   1. slab_mutex (Global Mutex)
     51 *   2. node->list_lock (Spinlock)
     52 *   3. kmem_cache->cpu_slab->lock (Local lock)
     53 *   4. slab_lock(slab) (Only on some arches or for debugging)
     54 *   5. object_map_lock (Only for debugging)
     55 *
     56 *   slab_mutex
     57 *
     58 *   The role of the slab_mutex is to protect the list of all the slabs
     59 *   and to synchronize major metadata changes to slab cache structures.
     60 *   Also synchronizes memory hotplug callbacks.
     61 *
     62 *   slab_lock
     63 *
     64 *   The slab_lock is a wrapper around the page lock, thus it is a bit
     65 *   spinlock.
     66 *
     67 *   The slab_lock is only used for debugging and on arches that do not
     68 *   have the ability to do a cmpxchg_double. It only protects:
     69 *	A. slab->freelist	-> List of free objects in a slab
     70 *	B. slab->inuse		-> Number of objects in use
     71 *	C. slab->objects	-> Number of objects in slab
     72 *	D. slab->frozen		-> frozen state
     73 *
     74 *   Frozen slabs
     75 *
     76 *   If a slab is frozen then it is exempt from list management. It is not
     77 *   on any list except per cpu partial list. The processor that froze the
     78 *   slab is the one who can perform list operations on the slab. Other
     79 *   processors may put objects onto the freelist but the processor that
     80 *   froze the slab is the only one that can retrieve the objects from the
     81 *   slab's freelist.
     82 *
     83 *   list_lock
     84 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. While it is held, no slabs may be added to
 *   or removed from the lists and the number of partial slabs may not be
 *   modified. (Note that the total number of slabs is an atomic value that
 *   may be modified without taking the list lock).
     90 *
     91 *   The list_lock is a centralized lock and thus we avoid taking it as
     92 *   much as possible. As long as SLUB does not have to handle partial
     93 *   slabs, operations can continue without any centralized lock. F.e.
     94 *   allocating a long series of objects that fill up slabs does not require
     95 *   the list lock.
     96 *
     97 *   cpu_slab->lock local lock
     98 *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
    100 *   except the stat counters. This is a percpu structure manipulated only by
    101 *   the local cpu, so the lock protects against being preempted or interrupted
    102 *   by an irq. Fast path operations rely on lockless operations instead.
    103 *   On PREEMPT_RT, the local lock does not actually disable irqs (and thus
    104 *   prevent the lockless operations), so fastpath operations also need to take
    105 *   the lock and are no longer lockless.
    106 *
    107 *   lockless fastpaths
    108 *
    109 *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
    110 *   are fully lockless when satisfied from the percpu slab (and when
    111 *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
    112 *   They also don't disable preemption or migration or irqs. They rely on
    113 *   the transaction id (tid) field to detect being preempted or moved to
    114 *   another cpu.
    115 *
    116 *   irq, preemption, migration considerations
    117 *
    118 *   Interrupts are disabled as part of list_lock or local_lock operations, or
    119 *   around the slab_lock operation, in order to make the slab allocator safe
    120 *   to use in the context of an irq.
    121 *
    122 *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
    123 *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
    124 *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
    125 *   doesn't have to be revalidated in each section protected by the local lock.
    126 *
    127 * SLUB assigns one slab for allocation to each processor.
    128 * Allocations only occur from these slabs called cpu slabs.
    129 *
    130 * Slabs with free elements are kept on a partial list and during regular
    131 * operations no list for full slabs is used. If an object in a full slab is
    132 * freed then the slab will show up again on the partial lists.
    133 * We track full slabs for debugging purposes though because otherwise we
    134 * cannot scan all objects.
    135 *
    136 * Slabs are freed when they become empty. Teardown and setup is
    137 * minimal so we rely on the page allocators per cpu caches for
    138 * fast frees and allocs.
    139 *
    140 * slab->frozen		The slab is frozen and exempt from list processing.
    141 * 			This means that the slab is dedicated to a purpose
    142 * 			such as satisfying allocations for a specific
    143 * 			processor. Objects may be freed in the slab while
    144 * 			it is frozen but slab_free will then skip the usual
    145 * 			list operations. It is up to the processor holding
    146 * 			the slab to integrate the slab into the slab lists
    147 * 			when the slab is no longer needed.
    148 *
    149 * 			One use of this flag is to mark slabs that are
    150 * 			used for allocations. Then such a slab becomes a cpu
    151 * 			slab. The cpu slab may be equipped with an additional
    152 * 			freelist that allows lockless access to
    153 * 			free objects in addition to the regular freelist
    154 * 			that requires the slab lock.
    155 *
    156 * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
    157 * 			options set. This moves	slab handling out of
    158 * 			the fast path and disables lockless freelists.
    159 */
    160
/*
 * slub_get_cpu_ptr() returns this CPU's instance of the per-cpu variable
 * @var and slub_put_cpu_ptr() ends the section started by it.
 *
 * We could simply use migrate_disable()/enable() but as long as it's a
 * function call even on !PREEMPT_RT, use inline preempt_disable() there
 * (via get_cpu_ptr()/put_cpu_ptr()).
 */
#ifndef CONFIG_PREEMPT_RT
#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)
#else
/* PREEMPT_RT: disable migration, then look up this CPU's pointer. */
#define slub_get_cpu_ptr(var)		\
({					\
	migrate_disable();		\
	this_cpu_ptr(var);		\
})
/* (void)(var) merely evaluates the argument; only migration is re-enabled. */
#define slub_put_cpu_ptr(var)		\
do {					\
	(void)(var);			\
	migrate_enable();		\
} while (0)
#endif
    180
#ifdef CONFIG_SLUB_DEBUG
/*
 * Static key slub_debug_enabled: defined initially true when
 * CONFIG_SLUB_DEBUG_ON is set and initially false otherwise.
 */
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif		/* CONFIG_SLUB_DEBUG */
    188
    189static inline bool kmem_cache_debug(struct kmem_cache *s)
    190{
    191	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
    192}
    193
    194void *fixup_red_left(struct kmem_cache *s, void *p)
    195{
    196	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
    197		p += s->red_left_pad;
    198
    199	return p;
    200}
    201
    202static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
    203{
    204#ifdef CONFIG_SLUB_CPU_PARTIAL
    205	return !kmem_cache_debug(s);
    206#else
    207	return false;
    208#endif
    209}
    210
    211/*
    212 * Issues still to be resolved:
    213 *
    214 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
    215 *
    216 * - Variable sizing of the per node arrays
    217 */
    218
/*
 * Change to #define to log cmpxchg failures (enables the pr_info() calls in
 * the cmpxchg_double_slab() helpers).
 */
#undef SLUB_DEBUG_CMPXCHG

/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10

/* Initial value of slub_debug when CONFIG_SLUB_DEBUG_ON is set. */
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * These debug flags cannot use CMPXCHG because there might be consistency
 * issues when checking or reading debug information
 */
#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
				SLAB_TRACE)


/*
 * Debugging flags that require metadata to be stored in the slab.  These get
 * disabled when slub_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)

/*
 * Layout of a struct kmem_cache_order_objects value: the page order is kept
 * in the bits from OO_SHIFT upwards and the object count in the OO_SHIFT low
 * bits (see oo_make(), oo_order() and oo_objects()).
 */
#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */

/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
    262
/*
 * Tracking user of a slab.
 *
 * One struct track describes a single alloc or free event. Two of them are
 * kept back to back per object and indexed by enum track_item (see
 * get_track() and init_tracking()).
 */
#define TRACK_ADDRS_COUNT 16	/* Max stack entries captured per event */
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKDEPOT
	depot_stack_handle_t handle;	/* Stack depot handle of the saved trace */
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};

/* Index of a track record within an object's pair of records. */
enum track_item { TRACK_ALLOC, TRACK_FREE };
    278
/*
 * sysfs hooks: forward declarations when CONFIG_SYSFS is enabled, otherwise
 * inline stubs that do nothing and return 0.
 */
#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
#endif

/*
 * debugfs hook: forward declaration when both CONFIG_DEBUG_FS and
 * CONFIG_SLUB_DEBUG are enabled, otherwise an empty inline stub.
 */
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
static void debugfs_slab_add(struct kmem_cache *);
#else
static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
    293
/*
 * Increment the per-cpu statistics counter @si of cache @s. The body is
 * empty unless CONFIG_SLUB_STATS is enabled.
 */
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
    304
/*
 * Tracks the NUMA nodes for which we have kmem_cache_nodes allocated.
 * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
 * differ during memory hotplug/hotremove operations.
 * Protected by slab_mutex.
 */
static nodemask_t slab_nodes;
    312
    313/********************************************************************
    314 * 			Core slab cache functions
    315 *******************************************************************/
    316
    317/*
    318 * Returns freelist pointer (ptr). With hardening, this is obfuscated
    319 * with an XOR of the address where the pointer is held and a per-cache
    320 * random number.
    321 */
    322static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
    323				 unsigned long ptr_addr)
    324{
    325#ifdef CONFIG_SLAB_FREELIST_HARDENED
    326	/*
    327	 * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
    328	 * Normally, this doesn't cause any issues, as both set_freepointer()
    329	 * and get_freepointer() are called with a pointer with the same tag.
    330	 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
    331	 * example, when __free_slub() iterates over objects in a cache, it
    332	 * passes untagged pointers to check_object(). check_object() in turns
    333	 * calls get_freepointer() with an untagged pointer, which causes the
    334	 * freepointer to be restored incorrectly.
    335	 */
    336	return (void *)((unsigned long)ptr ^ s->random ^
    337			swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
    338#else
    339	return ptr;
    340#endif
    341}
    342
    343/* Returns the freelist pointer recorded at location ptr_addr. */
    344static inline void *freelist_dereference(const struct kmem_cache *s,
    345					 void *ptr_addr)
    346{
    347	return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
    348			    (unsigned long)ptr_addr);
    349}
    350
    351static inline void *get_freepointer(struct kmem_cache *s, void *object)
    352{
    353	object = kasan_reset_tag(object);
    354	return freelist_dereference(s, object + s->offset);
    355}
    356
    357static void prefetch_freepointer(const struct kmem_cache *s, void *object)
    358{
    359	prefetchw(object + s->offset);
    360}
    361
/*
 * Like get_freepointer(), except that when debug_pagealloc is enabled the
 * stored pointer is fetched with copy_from_kernel_nofault() instead of a
 * plain dereference.
 */
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	unsigned long freepointer_addr;
	void *p;

	if (!debug_pagealloc_enabled_static())
		return get_freepointer(s, object);

	object = kasan_reset_tag(object);
	freepointer_addr = (unsigned long)object + s->offset;
	/*
	 * NOTE(review): the return value is not checked, so @p may hold an
	 * indeterminate value if the copy faults - presumably callers discard
	 * the result in that case; confirm.
	 */
	copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
	return freelist_ptr(s, p, freepointer_addr);
}
    375
    376static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
    377{
    378	unsigned long freeptr_addr = (unsigned long)object + s->offset;
    379
    380#ifdef CONFIG_SLAB_FREELIST_HARDENED
    381	BUG_ON(object == fp); /* naive detection of double free or corruption */
    382#endif
    383
    384	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
    385	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
    386}
    387
/*
 * Loop over all objects in a slab.
 *
 * @__p:       iteration cursor, advanced by (__s)->size each step
 * @__s:       the kmem_cache
 * @__addr:    start address of the slab's memory
 * @__objects: number of objects to visit
 *
 * The cursor starts at fixup_red_left(__s, __addr). Note that @__addr and
 * @__s are evaluated on every iteration.
 */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = fixup_red_left(__s, __addr); \
		__p < (__addr) + (__objects) * (__s)->size; \
		__p += (__s)->size)
    393
    394static inline unsigned int order_objects(unsigned int order, unsigned int size)
    395{
    396	return ((unsigned int)PAGE_SIZE << order) / size;
    397}
    398
    399static inline struct kmem_cache_order_objects oo_make(unsigned int order,
    400		unsigned int size)
    401{
    402	struct kmem_cache_order_objects x = {
    403		(order << OO_SHIFT) + order_objects(order, size)
    404	};
    405
    406	return x;
    407}
    408
    409static inline unsigned int oo_order(struct kmem_cache_order_objects x)
    410{
    411	return x.x >> OO_SHIFT;
    412}
    413
    414static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
    415{
    416	return x.x & OO_MASK;
    417}
    418
    419#ifdef CONFIG_SLUB_CPU_PARTIAL
    420static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
    421{
    422	unsigned int nr_slabs;
    423
    424	s->cpu_partial = nr_objects;
    425
    426	/*
    427	 * We take the number of objects but actually limit the number of
    428	 * slabs on the per cpu partial list, in order to limit excessive
    429	 * growth of the list. For simplicity we assume that the slabs will
    430	 * be half-full.
    431	 */
    432	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
    433	s->cpu_partial_slabs = nr_slabs;
    434}
    435#else
    436static inline void
    437slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
    438{
    439}
    440#endif /* CONFIG_SLUB_CPU_PARTIAL */
    441
/*
 * Per slab locking using the pagelock
 *
 * __slab_lock() takes the lock by spinning on the PG_locked bit in the flags
 * word of the slab's page. It is a VM_BUG to call it on a tail page.
 */
static __always_inline void __slab_lock(struct slab *slab)
{
	struct page *page = slab_page(slab);

	VM_BUG_ON_PAGE(PageTail(page), page);
	bit_spin_lock(PG_locked, &page->flags);
}
    452
/*
 * Release the lock taken by __slab_lock(): drop the PG_locked bit spinlock
 * in the flags word of the slab's page.
 */
static __always_inline void __slab_unlock(struct slab *slab)
{
	struct page *page = slab_page(slab);

	VM_BUG_ON_PAGE(PageTail(page), page);
	__bit_spin_unlock(PG_locked, &page->flags);
}
    460
/*
 * Lock @slab. On PREEMPT_RT, interrupts are disabled first and the previous
 * state is saved in *@flags; on other configurations *@flags is not touched.
 */
static __always_inline void slab_lock(struct slab *slab, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_save(*flags);
	__slab_lock(slab);
}
    467
/*
 * Unlock @slab. On PREEMPT_RT, the interrupt state saved in *@flags by
 * slab_lock() is restored afterwards.
 */
static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags)
{
	__slab_unlock(slab);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}
    474
/*
 * Replace slab->freelist and slab->counters with the new values, but only if
 * both still hold the old values. Returns true if the update was done.
 * On failure cpu_relax() is called, CMPXCHG_DOUBLE_FAIL is counted and false
 * is returned. @n is only used in the optional SLUB_DEBUG_CMPXCHG message.
 *
 * Interrupts must be disabled (for the fallback code to work right), typically
 * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
 * so we disable interrupts as part of slab_[un]lock().
 */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		lockdep_assert_irqs_disabled();
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	/* Caches flagged __CMPXCHG_DOUBLE use the double-word cmpxchg. */
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&slab->freelist, &slab->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		/* Fallback: compare and update under the slab lock. */
		/* init to 0 to prevent spurious warnings */
		unsigned long flags = 0;

		slab_lock(slab, &flags);
		if (slab->freelist == freelist_old &&
					slab->counters == counters_old) {
			slab->freelist = freelist_new;
			slab->counters = counters_new;
			slab_unlock(slab, &flags);
			return true;
		}
		slab_unlock(slab, &flags);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
    520
/*
 * Same operation and return value as __cmpxchg_double_slab(), but without
 * the assertion that interrupts are already off: the fallback path disables
 * and restores interrupts itself around the slab lock.
 */
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
	/* Caches flagged __CMPXCHG_DOUBLE use the double-word cmpxchg. */
	if (s->flags & __CMPXCHG_DOUBLE) {
		if (cmpxchg_double(&slab->freelist, &slab->counters,
				   freelist_old, counters_old,
				   freelist_new, counters_new))
			return true;
	} else
#endif
	{
		/* Fallback: compare and update under the slab lock, irqs off. */
		unsigned long flags;

		local_irq_save(flags);
		__slab_lock(slab);
		if (slab->freelist == freelist_old &&
					slab->counters == counters_old) {
			slab->freelist = freelist_new;
			slab->counters = counters_new;
			__slab_unlock(slab);
			local_irq_restore(flags);
			return true;
		}
		__slab_unlock(slab);
		local_irq_restore(flags);
	}

	cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);

#ifdef SLUB_DEBUG_CMPXCHG
	pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif

	return false;
}
    561
    562#ifdef CONFIG_SLUB_DEBUG
/*
 * Single shared bitmap with room for MAX_OBJS_PER_PAGE bits, handed out by
 * get_map() with object_map_lock held and given back through put_map().
 */
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_RAW_SPINLOCK(object_map_lock);
    565
    566static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
    567		       struct slab *slab)
    568{
    569	void *addr = slab_address(slab);
    570	void *p;
    571
    572	bitmap_zero(obj_map, slab->objects);
    573
    574	for (p = slab->freelist; p; p = get_freepointer(s, p))
    575		set_bit(__obj_to_index(s, addr, p), obj_map);
    576}
    577
#if IS_ENABLED(CONFIG_KUNIT)
/*
 * If the current task is running a KUnit test that has a named resource
 * "slab_errors", increment the int that resource points to and return true.
 * Returns false when there is no such test or resource. The reporting
 * helpers in this file return early when this yields true.
 */
static bool slab_add_kunit_errors(void)
{
	struct kunit_resource *resource;

	if (likely(!current->kunit_test))
		return false;

	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
	if (!resource)
		return false;

	(*(int *)resource->data)++;
	/* Drop the reference obtained by the lookup above. */
	kunit_put_resource(resource);
	return true;
}
#else
/* Without KUnit there is never an error counter to update. */
static inline bool slab_add_kunit_errors(void) { return false; }
#endif
    597
/*
 * Determine a map of objects in use in a slab.
 *
 * The returned bitmap is the shared object_map, filled by __fill_map(): a
 * set bit marks an object that is on slab->freelist, so objects in use are
 * the ones whose bit is clear. object_map_lock is held on return and must be
 * released with put_map(). Interrupts must already be disabled.
 *
 * Node listlock must be held to guarantee that the slab does
 * not vanish from under us.
 */
static unsigned long *get_map(struct kmem_cache *s, struct slab *slab)
	__acquires(&object_map_lock)
{
	VM_BUG_ON(!irqs_disabled());

	raw_spin_lock(&object_map_lock);

	__fill_map(object_map, s, slab);

	return object_map;
}
    615
/*
 * Release the bitmap obtained from get_map(). @map must be the shared
 * object_map; object_map_lock is dropped.
 */
static void put_map(unsigned long *map) __releases(&object_map_lock)
{
	VM_BUG_ON(map != object_map);
	raw_spin_unlock(&object_map_lock);
}
    621
    622static inline unsigned int size_from_object(struct kmem_cache *s)
    623{
    624	if (s->flags & SLAB_RED_ZONE)
    625		return s->size - s->red_left_pad;
    626
    627	return s->size;
    628}
    629
    630static inline void *restore_red_left(struct kmem_cache *s, void *p)
    631{
    632	if (s->flags & SLAB_RED_ZONE)
    633		p -= s->red_left_pad;
    634
    635	return p;
    636}
    637
/*
 * Debug settings:
 *
 * slub_debug starts out as DEBUG_DEFAULT_FLAGS when CONFIG_SLUB_DEBUG_ON is
 * set; otherwise it is zero-initialized like the other settings below.
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

static char *slub_debug_string;
static int disable_higher_order_debug;
    649
/*
 * slub is about to manipulate internal object metadata.  This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error.  metadata_access_enable() is used
 * to tell kasan that these accesses are OK. Every call must be paired with
 * metadata_access_disable() once the access is done.
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
}
    660
/* Counterpart of metadata_access_enable(): calls kasan_enable_current(). */
static inline void metadata_access_disable(void)
{
	kasan_enable_current();
}
    665
    666/*
    667 * Object debugging
    668 */
    669
    670/* Verify that a pointer has an address that is valid within a slab page */
    671static inline int check_valid_pointer(struct kmem_cache *s,
    672				struct slab *slab, void *object)
    673{
    674	void *base;
    675
    676	if (!object)
    677		return 1;
    678
    679	base = slab_address(slab);
    680	object = kasan_reset_tag(object);
    681	object = restore_red_left(s, object);
    682	if (object < base || object >= base + slab->objects * s->size ||
    683		(object - base) % s->size) {
    684		return 0;
    685	}
    686
    687	return 1;
    688}
    689
    690static void print_section(char *level, char *text, u8 *addr,
    691			  unsigned int length)
    692{
    693	metadata_access_enable();
    694	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
    695			16, 1, kasan_reset_tag((void *)addr), length, 1);
    696	metadata_access_disable();
    697}
    698
    699/*
    700 * See comment in calculate_sizes().
    701 */
    702static inline bool freeptr_outside_object(struct kmem_cache *s)
    703{
    704	return s->offset >= s->inuse;
    705}
    706
    707/*
    708 * Return offset of the end of info block which is inuse + free pointer if
    709 * not overlapping with object.
    710 */
    711static inline unsigned int get_info_end(struct kmem_cache *s)
    712{
    713	if (freeptr_outside_object(s))
    714		return s->inuse + sizeof(void *);
    715	else
    716		return s->inuse;
    717}
    718
    719static struct track *get_track(struct kmem_cache *s, void *object,
    720	enum track_item alloc)
    721{
    722	struct track *p;
    723
    724	p = object + get_info_end(s);
    725
    726	return kasan_reset_tag(p + alloc);
    727}
    728
#ifdef CONFIG_STACKDEPOT
/*
 * Save the current stack trace (at most TRACK_ADDRS_COUNT entries, skipping
 * the first 3) into the stack depot using GFP_NOWAIT and return the
 * resulting handle.
 */
static noinline depot_stack_handle_t set_track_prepare(void)
{
	depot_stack_handle_t handle;
	unsigned long entries[TRACK_ADDRS_COUNT];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
	handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);

	return handle;
}
#else
/* Without CONFIG_STACKDEPOT no trace is saved and the handle is always 0. */
static inline depot_stack_handle_t set_track_prepare(void)
{
	return 0;
}
#endif
    747
    748static void set_track_update(struct kmem_cache *s, void *object,
    749			     enum track_item alloc, unsigned long addr,
    750			     depot_stack_handle_t handle)
    751{
    752	struct track *p = get_track(s, object, alloc);
    753
    754#ifdef CONFIG_STACKDEPOT
    755	p->handle = handle;
    756#endif
    757	p->addr = addr;
    758	p->cpu = smp_processor_id();
    759	p->pid = current->pid;
    760	p->when = jiffies;
    761}
    762
    763static __always_inline void set_track(struct kmem_cache *s, void *object,
    764				      enum track_item alloc, unsigned long addr)
    765{
    766	depot_stack_handle_t handle = set_track_prepare();
    767
    768	set_track_update(s, object, alloc, addr, handle);
    769}
    770
    771static void init_tracking(struct kmem_cache *s, void *object)
    772{
    773	struct track *p;
    774
    775	if (!(s->flags & SLAB_STORE_USER))
    776		return;
    777
    778	p = get_track(s, object, TRACK_ALLOC);
    779	memset(p, 0, 2*sizeof(struct track));
    780}
    781
/*
 * Print one track record @t, labelled with @s. Records whose addr is zero
 * are skipped. The age printed is @pr_time minus the record's timestamp.
 * With CONFIG_STACKDEPOT the saved stack trace is printed as well, or a note
 * that it is missing when the handle is zero.
 */
static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
	depot_stack_handle_t handle __maybe_unused;

	if (!t->addr)
		return;

	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(t->handle);
	if (handle)
		stack_depot_print(handle);
	else
		pr_err("object allocation/free stack trace missing\n");
#endif
}
    799
    800void print_tracking(struct kmem_cache *s, void *object)
    801{
    802	unsigned long pr_time = jiffies;
    803	if (!(s->flags & SLAB_STORE_USER))
    804		return;
    805
    806	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
    807	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
    808}
    809
/*
 * Print a one-line summary of @slab: its address, object count, objects in
 * use, freelist pointer and the flags of its folio.
 */
static void print_slab_info(const struct slab *slab)
{
	/* The cast yields a non-const folio pointer to pass to folio_flags(). */
	struct folio *folio = (struct folio *)slab_folio(slab);

	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
	       slab, slab->objects, slab->inuse, slab->freelist,
	       folio_flags(folio, 0));
}
    818
    819static void slab_bug(struct kmem_cache *s, char *fmt, ...)
    820{
    821	struct va_format vaf;
    822	va_list args;
    823
    824	va_start(args, fmt);
    825	vaf.fmt = fmt;
    826	vaf.va = &args;
    827	pr_err("=============================================================================\n");
    828	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
    829	pr_err("-----------------------------------------------------------------------------\n\n");
    830	va_end(args);
    831}
    832
/*
 * Log a "FIX <cache name>: <message>" line describing a repair. The message
 * is built from the printf-style @fmt and its arguments. Nothing is printed
 * when slab_add_kunit_errors() returns true.
 */
__printf(2, 3)
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (slab_add_kunit_errors())
		return;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_err("FIX %s: %pV\n", s->name, &vaf);
	va_end(args);
}
    848
/*
 * Print the detail section of a report about object @p in @slab: tracking
 * records, slab summary, the object's address/offset/free pointer, then hex
 * dumps of the surrounding red zones (or the 16 bytes before the object),
 * the object itself and any padding, followed by a stack dump.
 */
static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
{
	unsigned int off;	/* Offset of last byte */
	u8 *addr = slab_address(slab);

	print_tracking(s, p);

	print_slab_info(slab);

	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
	       p, p - addr, get_freepointer(s, p));

	/* Left red zone if there is one, else the preceding 16 bytes. */
	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
			      s->red_left_pad);
	else if (p > addr + 16)
		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);

	/* The object dump is capped at PAGE_SIZE bytes. */
	print_section(KERN_ERR,         "Object   ", p,
		      min_t(unsigned int, s->object_size, PAGE_SIZE));
	if (s->flags & SLAB_RED_ZONE)
		print_section(KERN_ERR, "Redzone  ", p + s->object_size,
			s->inuse - s->object_size);

	/* Skip past the info block, track records and kasan metadata. */
	off = get_info_end(s);

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);

	off += kasan_metadata_size(s);

	if (off != size_from_object(s))
		/* Beginning of the filler is the free pointer */
		print_section(KERN_ERR, "Padding  ", p + off,
			      size_from_object(s) - off);

	dump_stack();
}
    887
/*
 * Report a problem with @object in @slab: print the bug banner with @reason,
 * the object trailer, and taint the kernel with TAINT_BAD_PAGE. Nothing is
 * reported when slab_add_kunit_errors() returns true.
 */
static void object_err(struct kmem_cache *s, struct slab *slab,
			u8 *object, char *reason)
{
	if (slab_add_kunit_errors())
		return;

	slab_bug(s, "%s", reason);
	print_trailer(s, slab, object);
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
    898
    899static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
    900			       void **freelist, void *nextfree)
    901{
    902	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
    903	    !check_valid_pointer(s, slab, nextfree) && freelist) {
    904		object_err(s, slab, *freelist, "Freechain corrupt");
    905		*freelist = NULL;
    906		slab_fix(s, "Isolate corrupted freechain");
    907		return true;
    908	}
    909
    910	return false;
    911}
    912
    913static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
    914			const char *fmt, ...)
    915{
    916	va_list args;
    917	char buf[100];
    918
    919	if (slab_add_kunit_errors())
    920		return;
    921
    922	va_start(args, fmt);
    923	vsnprintf(buf, sizeof(buf), fmt, args);
    924	va_end(args);
    925	slab_bug(s, "%s", buf);
    926	print_slab_info(slab);
    927	dump_stack();
    928	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
    929}
    930
    931static void init_object(struct kmem_cache *s, void *object, u8 val)
    932{
    933	u8 *p = kasan_reset_tag(object);
    934
    935	if (s->flags & SLAB_RED_ZONE)
    936		memset(p - s->red_left_pad, val, s->red_left_pad);
    937
    938	if (s->flags & __OBJECT_POISON) {
    939		memset(p, POISON_FREE, s->object_size - 1);
    940		p[s->object_size - 1] = POISON_END;
    941	}
    942
    943	if (s->flags & SLAB_RED_ZONE)
    944		memset(p + s->object_size, val, s->inuse - s->object_size);
    945}
    946
    947static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
    948						void *from, void *to)
    949{
    950	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
    951	memset(from, data, to - from);
    952}
    953
    954static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
    955			u8 *object, char *what,
    956			u8 *start, unsigned int value, unsigned int bytes)
    957{
    958	u8 *fault;
    959	u8 *end;
    960	u8 *addr = slab_address(slab);
    961
    962	metadata_access_enable();
    963	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
    964	metadata_access_disable();
    965	if (!fault)
    966		return 1;
    967
    968	end = start + bytes;
    969	while (end > fault && end[-1] == value)
    970		end--;
    971
    972	if (slab_add_kunit_errors())
    973		goto skip_bug_print;
    974
    975	slab_bug(s, "%s overwritten", what);
    976	pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
    977					fault, end - 1, fault - addr,
    978					fault[0], value);
    979	print_trailer(s, slab, object);
    980	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
    981
    982skip_bug_print:
    983	restore_bytes(s, what, value, fault, end);
    984	return 0;
    985}
    986
    987/*
    988 * Object layout:
    989 *
    990 * object address
    991 * 	Bytes of the object to be managed.
    992 * 	If the freepointer may overlay the object then the free
    993 *	pointer is at the middle of the object.
    994 *
    995 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
    996 * 	0xa5 (POISON_END)
    997 *
    998 * object + s->object_size
    999 * 	Padding to reach word boundary. This is also used for Redzoning.
   1000 * 	Padding is extended by another word if Redzoning is enabled and
   1001 * 	object_size == inuse.
   1002 *
   1003 * 	We fill with 0xbb (RED_INACTIVE) for inactive objects and with
   1004 * 	0xcc (RED_ACTIVE) for objects in use.
   1005 *
   1006 * object + s->inuse
   1007 * 	Meta data starts here.
   1008 *
   1009 * 	A. Free pointer (if we cannot overwrite object on free)
   1010 * 	B. Tracking data for SLAB_STORE_USER
   1011 *	C. Padding to reach required alignment boundary or at minimum
   1012 * 		one word if debugging is on to be able to detect writes
   1013 * 		before the word boundary.
   1014 *
   1015 *	Padding is done using 0x5a (POISON_INUSE)
   1016 *
   1017 * object + s->size
   1018 * 	Nothing is used beyond s->size.
   1019 *
   1020 * If slabcaches are merged then the object_size and inuse boundaries are mostly
   1021 * ignored. And therefore no slab options that rely on these boundaries
   1022 * may be used with merged slabcaches.
   1023 */
   1024
   1025static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
   1026{
   1027	unsigned long off = get_info_end(s);	/* The end of info */
   1028
   1029	if (s->flags & SLAB_STORE_USER)
   1030		/* We also have user information there */
   1031		off += 2 * sizeof(struct track);
   1032
   1033	off += kasan_metadata_size(s);
   1034
   1035	if (size_from_object(s) == off)
   1036		return 1;
   1037
   1038	return check_bytes_and_report(s, slab, p, "Object padding",
   1039			p + off, POISON_INUSE, size_from_object(s) - off);
   1040}
   1041
   1042/* Check the pad bytes at the end of a slab page */
   1043static void slab_pad_check(struct kmem_cache *s, struct slab *slab)
   1044{
   1045	u8 *start;
   1046	u8 *fault;
   1047	u8 *end;
   1048	u8 *pad;
   1049	int length;
   1050	int remainder;
   1051
   1052	if (!(s->flags & SLAB_POISON))
   1053		return;
   1054
   1055	start = slab_address(slab);
   1056	length = slab_size(slab);
   1057	end = start + length;
   1058	remainder = length % s->size;
   1059	if (!remainder)
   1060		return;
   1061
   1062	pad = end - remainder;
   1063	metadata_access_enable();
   1064	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
   1065	metadata_access_disable();
   1066	if (!fault)
   1067		return;
   1068	while (end > fault && end[-1] == POISON_INUSE)
   1069		end--;
   1070
   1071	slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
   1072			fault, end - 1, fault - start);
   1073	print_section(KERN_ERR, "Padding ", pad, remainder);
   1074
   1075	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
   1076}
   1077
/*
 * Validate the debug bytes around and inside one object, and its free
 * pointer.
 *
 * @s:      cache the object belongs to
 * @slab:   slab containing the object
 * @object: object to check
 * @val:    redzone byte expected around the object. Callers in this file
 *          pass SLUB_RED_INACTIVE for an object coming off the freelist
 *          and SLUB_RED_ACTIVE for an object that is being freed.
 *
 * Returns 0 if a redzone, the poison pattern or the free pointer is
 * damaged, 1 otherwise. Damaged alignment padding or object padding is
 * reported and repaired by the helpers but does not affect the result.
 */
static int check_object(struct kmem_cache *s, struct slab *slab,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->object_size;

	if (s->flags & SLAB_RED_ZONE) {
		/* Both redzones must hold @val; either failure is fatal. */
		if (!check_bytes_and_report(s, slab, object, "Left Redzone",
			object - s->red_left_pad, val, s->red_left_pad))
			return 0;

		if (!check_bytes_and_report(s, slab, object, "Right Redzone",
			endobject, val, s->inuse - s->object_size))
			return 0;
	} else {
		/*
		 * Without redzoning, the gap between object_size and inuse
		 * is expected to hold POISON_INUSE on poisoned caches. The
		 * result is deliberately ignored here.
		 */
		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
			check_bytes_and_report(s, slab, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->object_size);
		}
	}

	if (s->flags & SLAB_POISON) {
		/*
		 * The poison pattern is only checked when the caller does
		 * not expect an active object (val != SLUB_RED_ACTIVE):
		 * POISON_FREE everywhere except a final POISON_END byte.
		 */
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
			(!check_bytes_and_report(s, slab, p, "Poison", p,
					POISON_FREE, s->object_size - 1) ||
			 !check_bytes_and_report(s, slab, p, "End Poison",
				p + s->object_size - 1, POISON_END, 1)))
			return 0;
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		check_pad_bytes(s, slab, p);
	}

	if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
		/*
		 * Object and freepointer overlap. Cannot check
		 * freepointer while object is allocated.
		 */
		return 1;

	/* Check free pointer validity */
	if (!check_valid_pointer(s, slab, get_freepointer(s, p))) {
		object_err(s, slab, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		return 0;
	}
	return 1;
}
   1133
   1134static int check_slab(struct kmem_cache *s, struct slab *slab)
   1135{
   1136	int maxobj;
   1137
   1138	if (!folio_test_slab(slab_folio(slab))) {
   1139		slab_err(s, slab, "Not a valid slab page");
   1140		return 0;
   1141	}
   1142
   1143	maxobj = order_objects(slab_order(slab), s->size);
   1144	if (slab->objects > maxobj) {
   1145		slab_err(s, slab, "objects %u > max %u",
   1146			slab->objects, maxobj);
   1147		return 0;
   1148	}
   1149	if (slab->inuse > slab->objects) {
   1150		slab_err(s, slab, "inuse %u > max %u",
   1151			slab->inuse, slab->objects);
   1152		return 0;
   1153	}
   1154	/* Slab_pad_check fixes things up after itself */
   1155	slab_pad_check(s, slab);
   1156	return 1;
   1157}
   1158
/*
 * Determine if a certain object in a slab is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 *
 * The walk also validates the freelist and the slab counters and repairs
 * what it finds:
 *  - a bad link after at least one good object truncates the list at that
 *    object;
 *  - a bad list head clears the freelist, marks every object in use and
 *    returns 0 immediately;
 *  - slab->objects and slab->inuse are corrected if they disagree with the
 *    computed maximum and the number of entries walked.
 *
 * Returns 1 if @search was found on the list. If it was not found, the
 * result is 1 only when @search is NULL, otherwise 0.
 */
static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
	int nr = 0;		/* Number of freelist entries walked */
	void *fp;
	void *object = NULL;	/* Last valid entry seen, NULL at the head */
	int max_objects;

	fp = slab->freelist;
	/* The nr bound stops the walk if the chain loops back on itself. */
	while (fp && nr <= slab->objects) {
		if (fp == search)
			return 1;
		if (!check_valid_pointer(s, slab, fp)) {
			if (object) {
				/* Cut the chain after the last good object. */
				object_err(s, slab, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
			} else {
				/* The list head itself is bad. */
				slab_err(s, slab, "Freepointer corrupt");
				slab->freelist = NULL;
				slab->inuse = slab->objects;
				slab_fix(s, "Freelist cleared");
				return 0;
			}
			break;
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	max_objects = order_objects(slab_order(slab), s->size);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (slab->objects != max_objects) {
		slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
			 slab->objects, max_objects);
		slab->objects = max_objects;
		slab_fix(s, "Number of objects adjusted");
	}
	/* In-use objects are whatever is not on the freelist. */
	if (slab->inuse != slab->objects - nr) {
		slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
			 slab->inuse, slab->objects - nr);
		slab->inuse = slab->objects - nr;
		slab_fix(s, "Object count adjusted");
	}
	return search == NULL;
}
   1211
   1212static void trace(struct kmem_cache *s, struct slab *slab, void *object,
   1213								int alloc)
   1214{
   1215	if (s->flags & SLAB_TRACE) {
   1216		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
   1217			s->name,
   1218			alloc ? "alloc" : "free",
   1219			object, slab->inuse,
   1220			slab->freelist);
   1221
   1222		if (!alloc)
   1223			print_section(KERN_INFO, "Object ", (void *)object,
   1224					s->object_size);
   1225
   1226		dump_stack();
   1227	}
   1228}
   1229
   1230/*
   1231 * Tracking of fully allocated slabs for debugging purposes.
   1232 */
   1233static void add_full(struct kmem_cache *s,
   1234	struct kmem_cache_node *n, struct slab *slab)
   1235{
   1236	if (!(s->flags & SLAB_STORE_USER))
   1237		return;
   1238
   1239	lockdep_assert_held(&n->list_lock);
   1240	list_add(&slab->slab_list, &n->full);
   1241}
   1242
   1243static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
   1244{
   1245	if (!(s->flags & SLAB_STORE_USER))
   1246		return;
   1247
   1248	lockdep_assert_held(&n->list_lock);
   1249	list_del(&slab->slab_list);
   1250}
   1251
   1252/* Tracking of the number of slabs for debugging purposes */
   1253static inline unsigned long slabs_node(struct kmem_cache *s, int node)
   1254{
   1255	struct kmem_cache_node *n = get_node(s, node);
   1256
   1257	return atomic_long_read(&n->nr_slabs);
   1258}
   1259
   1260static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
   1261{
   1262	return atomic_long_read(&n->nr_slabs);
   1263}
   1264
   1265static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
   1266{
   1267	struct kmem_cache_node *n = get_node(s, node);
   1268
   1269	/*
   1270	 * May be called early in order to allocate a slab for the
   1271	 * kmem_cache_node structure. Solve the chicken-egg
   1272	 * dilemma by deferring the increment of the count during
   1273	 * bootstrap (see early_kmem_cache_node_alloc).
   1274	 */
   1275	if (likely(n)) {
   1276		atomic_long_inc(&n->nr_slabs);
   1277		atomic_long_add(objects, &n->total_objects);
   1278	}
   1279}
   1280static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
   1281{
   1282	struct kmem_cache_node *n = get_node(s, node);
   1283
   1284	atomic_long_dec(&n->nr_slabs);
   1285	atomic_long_sub(objects, &n->total_objects);
   1286}
   1287
   1288/* Object debug checks for alloc/free paths */
   1289static void setup_object_debug(struct kmem_cache *s, void *object)
   1290{
   1291	if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
   1292		return;
   1293
   1294	init_object(s, object, SLUB_RED_INACTIVE);
   1295	init_tracking(s, object);
   1296}
   1297
   1298static
   1299void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
   1300{
   1301	if (!kmem_cache_debug_flags(s, SLAB_POISON))
   1302		return;
   1303
   1304	metadata_access_enable();
   1305	memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
   1306	metadata_access_disable();
   1307}
   1308
   1309static inline int alloc_consistency_checks(struct kmem_cache *s,
   1310					struct slab *slab, void *object)
   1311{
   1312	if (!check_slab(s, slab))
   1313		return 0;
   1314
   1315	if (!check_valid_pointer(s, slab, object)) {
   1316		object_err(s, slab, object, "Freelist Pointer check fails");
   1317		return 0;
   1318	}
   1319
   1320	if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
   1321		return 0;
   1322
   1323	return 1;
   1324}
   1325
   1326static noinline int alloc_debug_processing(struct kmem_cache *s,
   1327					struct slab *slab,
   1328					void *object, unsigned long addr)
   1329{
   1330	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
   1331		if (!alloc_consistency_checks(s, slab, object))
   1332			goto bad;
   1333	}
   1334
   1335	/* Success perform special debug activities for allocs */
   1336	if (s->flags & SLAB_STORE_USER)
   1337		set_track(s, object, TRACK_ALLOC, addr);
   1338	trace(s, slab, object, 1);
   1339	init_object(s, object, SLUB_RED_ACTIVE);
   1340	return 1;
   1341
   1342bad:
   1343	if (folio_test_slab(slab_folio(slab))) {
   1344		/*
   1345		 * If this is a slab page then lets do the best we can
   1346		 * to avoid issues in the future. Marking all objects
   1347		 * as used avoids touching the remaining objects.
   1348		 */
   1349		slab_fix(s, "Marking all objects used");
   1350		slab->inuse = slab->objects;
   1351		slab->freelist = NULL;
   1352	}
   1353	return 0;
   1354}
   1355
   1356static inline int free_consistency_checks(struct kmem_cache *s,
   1357		struct slab *slab, void *object, unsigned long addr)
   1358{
   1359	if (!check_valid_pointer(s, slab, object)) {
   1360		slab_err(s, slab, "Invalid object pointer 0x%p", object);
   1361		return 0;
   1362	}
   1363
   1364	if (on_freelist(s, slab, object)) {
   1365		object_err(s, slab, object, "Object already free");
   1366		return 0;
   1367	}
   1368
   1369	if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
   1370		return 0;
   1371
   1372	if (unlikely(s != slab->slab_cache)) {
   1373		if (!folio_test_slab(slab_folio(slab))) {
   1374			slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
   1375				 object);
   1376		} else if (!slab->slab_cache) {
   1377			pr_err("SLUB <none>: no slab for object 0x%p.\n",
   1378			       object);
   1379			dump_stack();
   1380		} else
   1381			object_err(s, slab, object,
   1382					"page slab pointer corrupt.");
   1383		return 0;
   1384	}
   1385	return 1;
   1386}
   1387
/*
 * Debug work done when objects are freed.
 * Supports checking bulk free of a constructed freelist.
 *
 * @s:        cache the objects are freed to
 * @slab:     slab all objects are expected to belong to
 * @head:     first object of the constructed freelist
 * @tail:     last object of the constructed freelist
 * @bulk_cnt: number of objects the caller claims are on that list
 * @addr:     caller address recorded in the free track
 *
 * Runs with the node's list_lock (interrupts saved) and the slab lock
 * held. Returns 1 if every object from @head to @tail was processed, 0 if
 * a consistency check failed; a mismatch between @bulk_cnt and the number
 * of objects visited is reported as well.
 */
static noinline int free_debug_processing(
	struct kmem_cache *s, struct slab *slab,
	void *head, void *tail, int bulk_cnt,
	unsigned long addr)
{
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	void *object = head;
	int cnt = 0;		/* Objects visited so far */
	unsigned long flags, flags2;
	int ret = 0;
	depot_stack_handle_t handle = 0;

	/* Done before the locks are taken; reused for every object below. */
	if (s->flags & SLAB_STORE_USER)
		handle = set_track_prepare();

	spin_lock_irqsave(&n->list_lock, flags);
	slab_lock(slab, &flags2);

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!check_slab(s, slab))
			goto out;
	}

next_object:
	cnt++;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!free_consistency_checks(s, slab, object, addr))
			goto out;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track_update(s, object, TRACK_FREE, addr, handle);
	trace(s, slab, object, 0);
	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
	init_object(s, object, SLUB_RED_INACTIVE);

	/* Reached end of constructed freelist yet? */
	if (object != tail) {
		object = get_freepointer(s, object);
		goto next_object;
	}
	ret = 1;

out:
	/*
	 * NOTE(review): this comparison also runs on the error paths, where
	 * cnt only reflects the objects visited before the failure.
	 */
	if (cnt != bulk_cnt)
		slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
			 bulk_cnt, cnt);

	slab_unlock(slab, &flags2);
	spin_unlock_irqrestore(&n->list_lock, flags);
	if (!ret)
		slab_fix(s, "Object at 0x%p not freed", object);
	return ret;
}
   1444
/*
 * Parse a block of slub_debug options. Blocks are delimited by ';'
 *
 * A block has the form "<option letters>[,<slab list>]". Option letters
 * are matched case-insensitively.
 *
 * @str:    start of block
 * @flags:  returns the parsed flags; DEBUG_DEFAULT_FLAGS if the block
 *          starts directly with a slab list (','), 0 if no known option
 *          letter is present
 * @slabs:  return start of list of slabs, or NULL when there's no list
 * @init:   assume this is initial parsing and not per-kmem-create parsing;
 *          only then are unknown options logged and is the 'o' option
 *          applied to disable_higher_order_debug
 *
 * returns the start of next block if there's any, or NULL
 */
static char *
parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
{
	bool higher_order_disable = false;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (*str == ',') {
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		*flags = DEBUG_DEFAULT_FLAGS;
		goto check_slabs;
	}
	*flags = 0;

	/* Determine which debug features should be switched on */
	for (; *str && *str != ',' && *str != ';'; str++) {
		switch (tolower(*str)) {
		case '-':
			/* Drops every flag collected so far in this block. */
			*flags = 0;
			break;
		case 'f':
			*flags |= SLAB_CONSISTENCY_CHECKS;
			break;
		case 'z':
			*flags |= SLAB_RED_ZONE;
			break;
		case 'p':
			*flags |= SLAB_POISON;
			break;
		case 'u':
			*flags |= SLAB_STORE_USER;
			break;
		case 't':
			*flags |= SLAB_TRACE;
			break;
		case 'a':
			*flags |= SLAB_FAILSLAB;
			break;
		case 'o':
			/*
			 * Avoid enabling debugging on caches if its minimum
			 * order would increase as a result.
			 */
			higher_order_disable = true;
			break;
		default:
			if (init)
				pr_err("slub_debug option '%c' unknown. skipped\n", *str);
		}
	}
check_slabs:
	/* The slab list, if any, starts right after the ','. */
	if (*str == ',')
		*slabs = ++str;
	else
		*slabs = NULL;

	/* Skip over the slab list */
	while (*str && *str != ';')
		str++;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (init && higher_order_disable)
		disable_higher_order_debug = 1;

	if (*str)
		return str;
	else
		return NULL;
}
   1532
/*
 * Handler for the "slub_debug" kernel command line parameter.
 *
 * @str: the text following "slub_debug" (expected to start with '=' when
 *       options are given)
 *
 * Computes the global slub_debug flags, remembers the option string in
 * slub_debug_string when at least one block names specific slabs, and
 * enables or disables the slub_debug_enabled static branch accordingly.
 * Always returns 1.
 */
static int __init setup_slub_debug(char *str)
{
	slab_flags_t flags;
	slab_flags_t global_flags;
	char *saved_str;
	char *slab_list;
	bool global_slub_debug_changed = false;
	bool slab_list_specified = false;

	global_flags = DEBUG_DEFAULT_FLAGS;
	if (*str++ != '=' || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	saved_str = str;
	while (str) {
		str = parse_slub_debug_flags(str, &flags, &slab_list, true);

		if (!slab_list) {
			/* A block without a slab list sets the global flags. */
			global_flags = flags;
			global_slub_debug_changed = true;
		} else {
			slab_list_specified = true;
			if (flags & SLAB_STORE_USER)
				stack_depot_want_early_init();
		}
	}

	/*
	 * For backwards compatibility, a single list of flags with list of
	 * slabs means debugging is only changed for those slabs, so the global
	 * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
	 * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
	 * long as there is no option specifying flags without a slab list.
	 */
	if (slab_list_specified) {
		if (!global_slub_debug_changed)
			global_flags = slub_debug;
		slub_debug_string = saved_str;
	}
out:
	slub_debug = global_flags;
	if (slub_debug & SLAB_STORE_USER)
		stack_depot_want_early_init();
	/* Debugging is live if any global flag or any per-slab block is set. */
	if (slub_debug != 0 || slub_debug_string)
		static_branch_enable(&slub_debug_enabled);
	else
		static_branch_disable(&slub_debug_enabled);
	if ((static_branch_unlikely(&init_on_alloc) ||
	     static_branch_unlikely(&init_on_free)) &&
	    (slub_debug & SLAB_POISON))
		pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
	return 1;
}

__setup("slub_debug", setup_slub_debug);
   1591
/*
 * kmem_cache_flags - apply debugging options to the cache
 * @object_size:	the size of an object without meta data
 *			(not used by this implementation)
 * @flags:		flags to set
 * @name:		name of the cache
 *
 * Debug option(s) are applied to @flags. In addition to the debug
 * option(s), if a slab name (or multiple) is specified i.e.
 * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
 * then only the select slabs will receive the debug option(s).
 *
 * A slab name in the list may contain a '*'; only the part before it is
 * compared against @name. The first matching block wins and its flags are
 * OR-ed into @flags. If no block matches, the global slub_debug flags are
 * OR-ed in instead. Caches with SLAB_NO_USER_FLAGS are returned unchanged.
 */
slab_flags_t kmem_cache_flags(unsigned int object_size,
	slab_flags_t flags, const char *name)
{
	char *iter;
	size_t len;
	char *next_block;
	slab_flags_t block_flags;
	slab_flags_t slub_debug_local = slub_debug;

	if (flags & SLAB_NO_USER_FLAGS)
		return flags;

	/*
	 * If the slab cache is for debugging (e.g. kmemleak) then
	 * don't store user (stack trace) information by default,
	 * but let the user enable it via the command line below.
	 */
	if (flags & SLAB_NOLEAKTRACE)
		slub_debug_local &= ~SLAB_STORE_USER;

	len = strlen(name);
	next_block = slub_debug_string;
	/* Go through all blocks of debug options, see if any matches our slab's name */
	while (next_block) {
		next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
		if (!iter)
			continue;
		/* Found a block that has a slab list, search it */
		while (*iter) {
			char *end, *glob;
			size_t cmplen;

			/* The entry ends at the next ',' or the string end... */
			end = strchrnul(iter, ',');
			/*
			 * ...unless that lies beyond this block, in which
			 * case it ends just before the next block starts.
			 */
			if (next_block && next_block < end)
				end = next_block - 1;

			glob = strnchr(iter, end - iter, '*');
			if (glob)
				/* Prefix match up to the '*'. */
				cmplen = glob - iter;
			else
				/*
				 * Compare over the longer of the two lengths
				 * so that a mere prefix of either string
				 * does not count as a match.
				 */
				cmplen = max_t(size_t, len, (end - iter));

			if (!strncmp(name, iter, cmplen)) {
				flags |= block_flags;
				return flags;
			}

			/* Last entry of this block's slab list. */
			if (!*end || *end == ';')
				break;
			iter = end + 1;
		}
	}

	return flags | slub_debug_local;
}
   1658#else /* !CONFIG_SLUB_DEBUG */
/*
 * Stand-ins used when CONFIG_SLUB_DEBUG is not set. The void helpers do
 * nothing; the value-returning ones return fixed results as noted below.
 */
static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}

/* Always returns 0. */
static inline int alloc_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *object, unsigned long addr) { return 0; }

/* Always returns 0. */
static inline int free_debug_processing(
	struct kmem_cache *s, struct slab *slab,
	void *head, void *tail, int bulk_cnt,
	unsigned long addr) { return 0; }

static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
/* Always reports the object as good (returns 1). */
static inline int check_object(struct kmem_cache *s, struct slab *slab,
			void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct slab *slab) {}
/* Without debug support the requested flags are passed through as is. */
slab_flags_t kmem_cache_flags(unsigned int object_size,
	slab_flags_t flags, const char *name)
{
	return flags;
}
/* No global debug flags exist in this configuration. */
#define slub_debug 0

#define disable_higher_order_debug 0

/* Slab counting is not available in this configuration; both report 0. */
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
							{ return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
							{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects) {}

/* Never reports corruption. */
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
			       void **freelist, void *nextfree)
{
	return false;
}
   1701#endif /* CONFIG_SLUB_DEBUG */
   1702
   1703/*
   1704 * Hooks for other subsystems that check memory allocations. In a typical
   1705 * production configuration these hooks all should produce no code at all.
   1706 */
   1707static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
   1708{
   1709	ptr = kasan_kmalloc_large(ptr, size, flags);
   1710	/* As ptr might get tagged, call kmemleak hook after KASAN. */
   1711	kmemleak_alloc(ptr, size, 1, flags);
   1712	return ptr;
   1713}
   1714
/*
 * Run the free-side hooks for @x: the kmemleak hook first, then the KASAN
 * large-free hook.
 */
static __always_inline void kfree_hook(void *x)
{
	kmemleak_free(x);
	kasan_kfree_large(x);
}
   1720
   1721static __always_inline bool slab_free_hook(struct kmem_cache *s,
   1722						void *x, bool init)
   1723{
   1724	kmemleak_free_recursive(x, s->flags);
   1725
   1726	debug_check_no_locks_freed(x, s->object_size);
   1727
   1728	if (!(s->flags & SLAB_DEBUG_OBJECTS))
   1729		debug_check_no_obj_freed(x, s->object_size);
   1730
   1731	/* Use KCSAN to help debug racy use-after-free. */
   1732	if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
   1733		__kcsan_check_access(x, s->object_size,
   1734				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
   1735
   1736	/*
   1737	 * As memory initialization might be integrated into KASAN,
   1738	 * kasan_slab_free and initialization memset's must be
   1739	 * kept together to avoid discrepancies in behavior.
   1740	 *
   1741	 * The initialization memset's clear the object and the metadata,
   1742	 * but don't touch the SLAB redzone.
   1743	 */
   1744	if (init) {
   1745		int rsize;
   1746
   1747		if (!kasan_has_integrated_init())
   1748			memset(kasan_reset_tag(x), 0, s->object_size);
   1749		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
   1750		memset((char *)kasan_reset_tag(x) + s->inuse, 0,
   1751		       s->size - s->inuse - rsize);
   1752	}
   1753	/* KASAN might put x into memory quarantine, delaying its reuse. */
   1754	return kasan_slab_free(s, x, init);
   1755}
   1756
/*
 * Run slab_free_hook() on every object of a detached freelist and rebuild
 * the list from the objects whose reuse is not delayed.
 *
 * @s:    cache the objects are freed to
 * @head: in: first object of the list; out: first object of the rebuilt
 *        list, or NULL if nothing is left
 * @tail: in: last object of the list, or NULL for a single object; out:
 *        last object of the rebuilt list, or NULL if it holds at most one
 *        object
 * @cnt:  decremented once for every object dropped from the list
 *
 * Returns true if at least one object remains to be freed. A KFENCE
 * object is passed through slab_free_hook() without initialization and
 * true is returned with @head and @tail untouched.
 */
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
					   void **head, void **tail,
					   int *cnt)
{

	void *object;
	void *next = *head;
	void *old_tail = *tail ? *tail : *head;

	if (is_kfence_address(next)) {
		slab_free_hook(s, next, false);
		return true;
	}

	/* Head and tail of the reconstructed freelist */
	*head = NULL;
	*tail = NULL;

	do {
		object = next;
		/* Fetch the successor before the hook may modify the object. */
		next = get_freepointer(s, object);

		/* If object's reuse doesn't have to be delayed */
		if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
			/* Move object to the new freelist */
			set_freepointer(s, object, *head);
			*head = object;
			/* The first object kept ends up last in the new list. */
			if (!*tail)
				*tail = object;
		} else {
			/*
			 * Adjust the reconstructed freelist depth
			 * accordingly if object's reuse is delayed.
			 */
			--(*cnt);
		}
	} while (object != old_tail);

	/* Zero or one object left: report it through *head alone. */
	if (*head == *tail)
		*tail = NULL;

	return *head != NULL;
}
   1800
   1801static void *setup_object(struct kmem_cache *s, void *object)
   1802{
   1803	setup_object_debug(s, object);
   1804	object = kasan_init_slab_obj(s, object);
   1805	if (unlikely(s->ctor)) {
   1806		kasan_unpoison_object_data(s, object);
   1807		s->ctor(object);
   1808		kasan_poison_object_data(s, object);
   1809	}
   1810	return object;
   1811}
   1812
   1813/*
   1814 * Slab allocation and freeing
   1815 */
   1816static inline struct slab *alloc_slab_page(gfp_t flags, int node,
   1817		struct kmem_cache_order_objects oo)
   1818{
   1819	struct folio *folio;
   1820	struct slab *slab;
   1821	unsigned int order = oo_order(oo);
   1822
   1823	if (node == NUMA_NO_NODE)
   1824		folio = (struct folio *)alloc_pages(flags, order);
   1825	else
   1826		folio = (struct folio *)__alloc_pages_node(node, flags, order);
   1827
   1828	if (!folio)
   1829		return NULL;
   1830
   1831	slab = folio_slab(folio);
   1832	__folio_set_slab(folio);
   1833	if (page_is_pfmemalloc(folio_page(folio, 0)))
   1834		slab_set_pfmemalloc(slab);
   1835
   1836	return slab;
   1837}
   1838
   1839#ifdef CONFIG_SLAB_FREELIST_RANDOM
   1840/* Pre-initialize the random sequence cache */
   1841static int init_cache_random_seq(struct kmem_cache *s)
   1842{
   1843	unsigned int count = oo_objects(s->oo);
   1844	int err;
   1845
   1846	/* Bailout if already initialised */
   1847	if (s->random_seq)
   1848		return 0;
   1849
   1850	err = cache_random_seq_create(s, count, GFP_KERNEL);
   1851	if (err) {
   1852		pr_err("SLUB: Unable to initialize free list for %s\n",
   1853			s->name);
   1854		return err;
   1855	}
   1856
   1857	/* Transform to an offset on the set of pages */
   1858	if (s->random_seq) {
   1859		unsigned int i;
   1860
   1861		for (i = 0; i < count; i++)
   1862			s->random_seq[i] *= s->size;
   1863	}
   1864	return 0;
   1865}
   1866
   1867/* Initialize each random sequence freelist per cache */
   1868static void __init init_freelist_randomization(void)
   1869{
   1870	struct kmem_cache *s;
   1871
   1872	mutex_lock(&slab_mutex);
   1873
   1874	list_for_each_entry(s, &slab_caches, list)
   1875		init_cache_random_seq(s);
   1876
   1877	mutex_unlock(&slab_mutex);
   1878}
   1879
   1880/* Get the next entry on the pre-computed freelist randomized */
   1881static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab,
   1882				unsigned long *pos, void *start,
   1883				unsigned long page_limit,
   1884				unsigned long freelist_count)
   1885{
   1886	unsigned int idx;
   1887
   1888	/*
   1889	 * If the target page allocation failed, the number of objects on the
   1890	 * page might be smaller than the usual size defined by the cache.
   1891	 */
   1892	do {
   1893		idx = s->random_seq[*pos];
   1894		*pos += 1;
   1895		if (*pos >= freelist_count)
   1896			*pos = 0;
   1897	} while (unlikely(idx >= page_limit));
   1898
   1899	return (char *)start + idx;
   1900}
   1901
   1902/* Shuffle the single linked freelist based on a random pre-computed sequence */
   1903static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
   1904{
   1905	void *start;
   1906	void *cur;
   1907	void *next;
   1908	unsigned long idx, pos, page_limit, freelist_count;
   1909
   1910	if (slab->objects < 2 || !s->random_seq)
   1911		return false;
   1912
   1913	freelist_count = oo_objects(s->oo);
   1914	pos = get_random_int() % freelist_count;
   1915
   1916	page_limit = slab->objects * s->size;
   1917	start = fixup_red_left(s, slab_address(slab));
   1918
   1919	/* First entry is used as the base of the freelist */
   1920	cur = next_freelist_entry(s, slab, &pos, start, page_limit,
   1921				freelist_count);
   1922	cur = setup_object(s, cur);
   1923	slab->freelist = cur;
   1924
   1925	for (idx = 1; idx < slab->objects; idx++) {
   1926		next = next_freelist_entry(s, slab, &pos, start, page_limit,
   1927			freelist_count);
   1928		next = setup_object(s, next);
   1929		set_freepointer(s, cur, next);
   1930		cur = next;
   1931	}
   1932	set_freepointer(s, cur, NULL);
   1933
   1934	return true;
   1935}
   1936#else
   1937static inline int init_cache_random_seq(struct kmem_cache *s)
   1938{
   1939	return 0;
   1940}
   1941static inline void init_freelist_randomization(void) { }
   1942static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
   1943{
   1944	return false;
   1945}
   1946#endif /* CONFIG_SLAB_FREELIST_RANDOM */
   1947
   1948static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
   1949{
   1950	struct slab *slab;
   1951	struct kmem_cache_order_objects oo = s->oo;
   1952	gfp_t alloc_gfp;
   1953	void *start, *p, *next;
   1954	int idx;
   1955	bool shuffle;
   1956
   1957	flags &= gfp_allowed_mask;
   1958
   1959	flags |= s->allocflags;
   1960
   1961	/*
   1962	 * Let the initial higher-order allocation fail under memory pressure
   1963	 * so we fall-back to the minimum order allocation.
   1964	 */
   1965	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
   1966	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
   1967		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
   1968
   1969	slab = alloc_slab_page(alloc_gfp, node, oo);
   1970	if (unlikely(!slab)) {
   1971		oo = s->min;
   1972		alloc_gfp = flags;
   1973		/*
   1974		 * Allocation may have failed due to fragmentation.
   1975		 * Try a lower order alloc if possible
   1976		 */
   1977		slab = alloc_slab_page(alloc_gfp, node, oo);
   1978		if (unlikely(!slab))
   1979			goto out;
   1980		stat(s, ORDER_FALLBACK);
   1981	}
   1982
   1983	slab->objects = oo_objects(oo);
   1984
   1985	account_slab(slab, oo_order(oo), s, flags);
   1986
   1987	slab->slab_cache = s;
   1988
   1989	kasan_poison_slab(slab);
   1990
   1991	start = slab_address(slab);
   1992
   1993	setup_slab_debug(s, slab, start);
   1994
   1995	shuffle = shuffle_freelist(s, slab);
   1996
   1997	if (!shuffle) {
   1998		start = fixup_red_left(s, start);
   1999		start = setup_object(s, start);
   2000		slab->freelist = start;
   2001		for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
   2002			next = p + s->size;
   2003			next = setup_object(s, next);
   2004			set_freepointer(s, p, next);
   2005			p = next;
   2006		}
   2007		set_freepointer(s, p, NULL);
   2008	}
   2009
   2010	slab->inuse = slab->objects;
   2011	slab->frozen = 1;
   2012
   2013out:
   2014	if (!slab)
   2015		return NULL;
   2016
   2017	inc_slabs_node(s, slab_nid(slab), slab->objects);
   2018
   2019	return slab;
   2020}
   2021
   2022static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
   2023{
   2024	if (unlikely(flags & GFP_SLAB_BUG_MASK))
   2025		flags = kmalloc_fix_flags(flags);
   2026
   2027	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
   2028
   2029	return allocate_slab(s,
   2030		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
   2031}
   2032
   2033static void __free_slab(struct kmem_cache *s, struct slab *slab)
   2034{
   2035	struct folio *folio = slab_folio(slab);
   2036	int order = folio_order(folio);
   2037	int pages = 1 << order;
   2038
   2039	if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
   2040		void *p;
   2041
   2042		slab_pad_check(s, slab);
   2043		for_each_object(p, s, slab_address(slab), slab->objects)
   2044			check_object(s, slab, p, SLUB_RED_INACTIVE);
   2045	}
   2046
   2047	__slab_clear_pfmemalloc(slab);
   2048	__folio_clear_slab(folio);
   2049	folio->mapping = NULL;
   2050	if (current->reclaim_state)
   2051		current->reclaim_state->reclaimed_slab += pages;
   2052	unaccount_slab(slab, order, s);
   2053	__free_pages(folio_page(folio, 0), order);
   2054}
   2055
   2056static void rcu_free_slab(struct rcu_head *h)
   2057{
   2058	struct slab *slab = container_of(h, struct slab, rcu_head);
   2059
   2060	__free_slab(slab->slab_cache, slab);
   2061}
   2062
   2063static void free_slab(struct kmem_cache *s, struct slab *slab)
   2064{
   2065	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
   2066		call_rcu(&slab->rcu_head, rcu_free_slab);
   2067	} else
   2068		__free_slab(s, slab);
   2069}
   2070
   2071static void discard_slab(struct kmem_cache *s, struct slab *slab)
   2072{
   2073	dec_slabs_node(s, slab_nid(slab), slab->objects);
   2074	free_slab(s, slab);
   2075}
   2076
   2077/*
   2078 * Management of partially allocated slabs.
   2079 */
   2080static inline void
   2081__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
   2082{
   2083	n->nr_partial++;
   2084	if (tail == DEACTIVATE_TO_TAIL)
   2085		list_add_tail(&slab->slab_list, &n->partial);
   2086	else
   2087		list_add(&slab->slab_list, &n->partial);
   2088}
   2089
   2090static inline void add_partial(struct kmem_cache_node *n,
   2091				struct slab *slab, int tail)
   2092{
   2093	lockdep_assert_held(&n->list_lock);
   2094	__add_partial(n, slab, tail);
   2095}
   2096
   2097static inline void remove_partial(struct kmem_cache_node *n,
   2098					struct slab *slab)
   2099{
   2100	lockdep_assert_held(&n->list_lock);
   2101	list_del(&slab->slab_list);
   2102	n->nr_partial--;
   2103}
   2104
/*
 * Remove slab from the partial list, freeze it and
 * return the pointer to the freelist.
 *
 * The caller must hold n->list_lock. When mode is non-zero the whole
 * freelist is taken over: the slab's freelist becomes NULL and inuse is set
 * to slab->objects. When mode is zero the slab is only frozen and its
 * freelist is left in place.
 *
 * Returns a list of objects or NULL if it fails.
 */
static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab,
		int mode)
{
	void *freelist;
	unsigned long counters;
	struct slab new;

	lockdep_assert_held(&n->list_lock);

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	freelist = slab->freelist;
	counters = slab->counters;
	new.counters = counters;
	if (mode) {
		new.inuse = slab->objects;
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}

	/* A slab sitting on the partial list is expected to be unfrozen */
	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	/*
	 * NOTE(review): presumably this fails when freelist/counters no
	 * longer match the values sampled above - confirm against
	 * __cmpxchg_double_slab(). On failure the slab stays on the list.
	 */
	if (!__cmpxchg_double_slab(s, slab,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, slab);
	WARN_ON(!freelist);
	return freelist;
}
   2149
#ifdef CONFIG_SLUB_CPU_PARTIAL
/* Forward declaration: used by get_partial_node() before its definition */
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
#else
/* Without CONFIG_SLUB_CPU_PARTIAL this is an empty stub */
static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
				   int drain) { }
#endif
/* Forward declaration: used by get_partial_node() before its definition */
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
   2157
/*
 * Try to allocate a partial slab from a specific node.
 *
 * Walks n->partial under n->list_lock. The first slab that can be acquired
 * is stored in *ret_slab and its freelist is the return value. With
 * CONFIG_SLUB_CPU_PARTIAL, further slabs may be acquired and handed to
 * put_cpu_partial() until more than s->cpu_partial_slabs / 2 of them have
 * been collected.
 *
 * Returns the freelist of the slab placed in *ret_slab, or NULL if n is
 * NULL, the node has no partial slabs, or no slab could be acquired. In
 * the NULL case *ret_slab is not written.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			      struct slab **ret_slab, gfp_t gfpflags)
{
	struct slab *slab, *slab2;
	void *object = NULL;
	unsigned long flags;
	unsigned int partial_slabs = 0;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partial()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		void *t;

		/* Skip slabs this allocation is not allowed to use */
		if (!pfmemalloc_match(slab, gfpflags))
			continue;

		/*
		 * Only the first acquired slab has its freelist taken over
		 * (mode != 0); later ones are merely frozen.
		 */
		t = acquire_slab(s, n, slab, object == NULL);
		if (!t)
			break;

		if (!object) {
			*ret_slab = slab;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
			put_cpu_partial(s, slab, 0);
			stat(s, CPU_PARTIAL_NODE);
			partial_slabs++;
		}
#ifdef CONFIG_SLUB_CPU_PARTIAL
		if (!kmem_cache_has_cpu_partial(s)
			|| partial_slabs > s->cpu_partial_slabs / 2)
			break;
#else
		/* No per-cpu partial list: one slab is all we can use */
		break;
#endif

	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return object;
}
   2210
/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 *
 * Without CONFIG_NUMA this always returns NULL. With CONFIG_NUMA the zones
 * of the zonelist selected by mempolicy_slab_node() are walked, and the
 * first allowed node that has more than s->min_partial partial slabs and
 * yields an object from get_partial_node() wins. The walk is repeated
 * while read_mems_allowed_retry() asks for it.
 *
 * Returns the object list obtained (with the slab in *ret_slab) or NULL.
 */
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
			     struct slab **ret_slab)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(flags);
	void *object;
	unsigned int cpuset_mems_cookie;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
	 * (which makes defrag_ratio = 1000) then every (well almost)
	 * allocation will first attempt to defrag slab caches on other nodes.
	 * This means scanning over all nodes to look for partial slabs which
	 * may be expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), flags);
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			/* Only take from nodes with partial slabs to spare */
			if (n && cpuset_zone_allowed(zone, flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, ret_slab, flags);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
					 */
					return object;
				}
			}
		}
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif	/* CONFIG_NUMA */
	return NULL;
}
   2274
   2275/*
   2276 * Get a partial slab, lock it and return it.
   2277 */
   2278static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
   2279			 struct slab **ret_slab)
   2280{
   2281	void *object;
   2282	int searchnode = node;
   2283
   2284	if (node == NUMA_NO_NODE)
   2285		searchnode = numa_mem_id();
   2286
   2287	object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
   2288	if (object || node != NUMA_NO_NODE)
   2289		return object;
   2290
   2291	return get_any_partial(s, flags, ret_slab);
   2292}
   2293
#ifdef CONFIG_PREEMPTION
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg. The transactions start with the cpu number and are then
 * incremented by TID_STEP, which is CONFIG_NR_CPUS rounded up to a power
 * of two.
 */
#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
#else
/*
 * No preemption supported therefore also no need to check for
 * different cpus.
 */
#define TID_STEP 1
#endif
   2308
   2309static inline unsigned long next_tid(unsigned long tid)
   2310{
   2311	return tid + TID_STEP;
   2312}
   2313
   2314#ifdef SLUB_DEBUG_CMPXCHG
   2315static inline unsigned int tid_to_cpu(unsigned long tid)
   2316{
   2317	return tid % TID_STEP;
   2318}
   2319
   2320static inline unsigned long tid_to_event(unsigned long tid)
   2321{
   2322	return tid / TID_STEP;
   2323}
   2324#endif
   2325
   2326static inline unsigned int init_tid(int cpu)
   2327{
   2328	return cpu;
   2329}
   2330
   2331static inline void note_cmpxchg_failure(const char *n,
   2332		const struct kmem_cache *s, unsigned long tid)
   2333{
   2334#ifdef SLUB_DEBUG_CMPXCHG
   2335	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
   2336
   2337	pr_info("%s %s: cmpxchg redo ", n, s->name);
   2338
   2339#ifdef CONFIG_PREEMPTION
   2340	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
   2341		pr_warn("due to cpu change %d -> %d\n",
   2342			tid_to_cpu(tid), tid_to_cpu(actual_tid));
   2343	else
   2344#endif
   2345	if (tid_to_event(tid) != tid_to_event(actual_tid))
   2346		pr_warn("due to cpu running other code. Event %ld->%ld\n",
   2347			tid_to_event(tid), tid_to_event(actual_tid));
   2348	else
   2349		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
   2350			actual_tid, tid, next_tid(tid));
   2351#endif
   2352	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
   2353}
   2354
   2355static void init_kmem_cache_cpus(struct kmem_cache *s)
   2356{
   2357	int cpu;
   2358	struct kmem_cache_cpu *c;
   2359
   2360	for_each_possible_cpu(cpu) {
   2361		c = per_cpu_ptr(s->cpu_slab, cpu);
   2362		local_lock_init(&c->lock);
   2363		c->tid = init_tid(cpu);
   2364	}
   2365}
   2366
/*
 * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
 * unfreezes the slabs and puts it on the proper list.
 * Assumes the slab has been already safely taken away from kmem_cache_cpu
 * by the caller.
 *
 * freelist is the per-cpu list of free objects that belonged to the slab;
 * it may be NULL. Depending on the resulting state the slab ends up on the
 * node's partial list (M_PARTIAL), on the full list (M_FULL, only with
 * SLAB_STORE_USER), on no list at all (M_FULL_NOLIST), or is discarded
 * (M_FREE).
 */
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
			    void *freelist)
{
	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST };
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	int free_delta = 0;
	enum slab_modes mode = M_NONE;
	void *nextfree, *freelist_iter, *freelist_tail;
	int tail = DEACTIVATE_TO_HEAD;
	unsigned long flags = 0;
	struct slab new;
	struct slab old;

	/*
	 * The slab's own freelist is non-empty: count it as remote frees and
	 * queue the slab at the tail of the partial list instead of the head.
	 */
	if (slab->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Count the objects on cpu's freelist as free_delta and
	 * remember the last object in freelist_tail for later splicing.
	 */
	freelist_tail = NULL;
	freelist_iter = freelist;
	while (freelist_iter) {
		nextfree = get_freepointer(s, freelist_iter);

		/*
		 * If 'nextfree' is invalid, it is possible that the object at
		 * 'freelist_iter' is already corrupted.  So isolate all objects
		 * starting at 'freelist_iter' by skipping them.
		 */
		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
			break;

		freelist_tail = freelist_iter;
		free_delta++;

		freelist_iter = nextfree;
	}

	/*
	 * Stage two: Unfreeze the slab while splicing the per-cpu
	 * freelist to the head of slab's freelist.
	 *
	 * Ensure that the slab is unfrozen while the list presence
	 * reflects the actual number of objects during unfreeze.
	 *
	 * We first perform cmpxchg holding lock and insert to list
	 * when it succeed. If there is mismatch then the slab is not
	 * unfrozen and number of objects in the slab may have changed.
	 * Then release lock and retry cmpxchg again.
	 */
redo:

	old.freelist = READ_ONCE(slab->freelist);
	old.counters = READ_ONCE(slab->counters);
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist_tail) {
		/* Splice: cpu freelist first, then the slab's current one */
		new.inuse -= free_delta;
		set_freepointer(s, freelist_tail, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

	/* Only M_PARTIAL and M_FULL touch a node list and take list_lock */
	if (!new.inuse && n->nr_partial >= s->min_partial) {
		mode = M_FREE;
	} else if (new.freelist) {
		mode = M_PARTIAL;
		/*
		 * Taking the spinlock removes the possibility that
		 * acquire_slab() will see a slab that is frozen
		 */
		spin_lock_irqsave(&n->list_lock, flags);
	} else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) {
		mode = M_FULL;
		/*
		 * This also ensures that the scanning of full
		 * slabs from diagnostic functions will not see
		 * any frozen slabs.
		 */
		spin_lock_irqsave(&n->list_lock, flags);
	} else {
		mode = M_FULL_NOLIST;
	}


	if (!cmpxchg_double_slab(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab")) {
		/* Drop the lock taken for this attempt before retrying */
		if (mode == M_PARTIAL || mode == M_FULL)
			spin_unlock_irqrestore(&n->list_lock, flags);
		goto redo;
	}


	if (mode == M_PARTIAL) {
		add_partial(n, slab, tail);
		spin_unlock_irqrestore(&n->list_lock, flags);
		/* tail doubles as the statistics item to bump */
		stat(s, tail);
	} else if (mode == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	} else if (mode == M_FULL) {
		add_full(s, n, slab);
		spin_unlock_irqrestore(&n->list_lock, flags);
		stat(s, DEACTIVATE_FULL);
	} else if (mode == M_FULL_NOLIST) {
		stat(s, DEACTIVATE_FULL);
	}
}
   2491
   2492#ifdef CONFIG_SLUB_CPU_PARTIAL
/*
 * Unfreeze every slab on the singly linked list starting at partial_slab
 * (linked through slab->next).
 *
 * Each slab is unfrozen with its node's list_lock held; the lock is only
 * dropped and re-taken when consecutive slabs belong to different nodes.
 * Slabs that turn out to be empty while the node already holds at least
 * s->min_partial partial slabs are collected and discarded after the last
 * lock has been released; all others are added to the tail of their node's
 * partial list.
 */
static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
{
	struct kmem_cache_node *n = NULL, *n2 = NULL;
	struct slab *slab, *slab_to_discard = NULL;
	unsigned long flags = 0;

	while (partial_slab) {
		struct slab new;
		struct slab old;

		slab = partial_slab;
		partial_slab = slab->next;

		/* Switch list_lock when moving on to a slab of another node */
		n2 = get_node(s, slab_nid(slab));
		if (n != n2) {
			if (n)
				spin_unlock_irqrestore(&n->list_lock, flags);

			n = n2;
			spin_lock_irqsave(&n->list_lock, flags);
		}

		/* Clear the frozen bit, retrying until the cmpxchg succeeds */
		do {

			old.freelist = slab->freelist;
			old.counters = slab->counters;
			VM_BUG_ON(!old.frozen);

			new.counters = old.counters;
			new.freelist = old.freelist;

			new.frozen = 0;

		} while (!__cmpxchg_double_slab(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"));

		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
			/* Empty and not needed: queue for discarding below */
			slab->next = slab_to_discard;
			slab_to_discard = slab;
		} else {
			add_partial(n, slab, DEACTIVATE_TO_TAIL);
			stat(s, FREE_ADD_PARTIAL);
		}
	}

	if (n)
		spin_unlock_irqrestore(&n->list_lock, flags);

	/* Discarding happens without any list_lock held */
	while (slab_to_discard) {
		slab = slab_to_discard;
		slab_to_discard = slab_to_discard->next;

		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	}
}
   2552
   2553/*
   2554 * Unfreeze all the cpu partial slabs.
   2555 */
   2556static void unfreeze_partials(struct kmem_cache *s)
   2557{
   2558	struct slab *partial_slab;
   2559	unsigned long flags;
   2560
   2561	local_lock_irqsave(&s->cpu_slab->lock, flags);
   2562	partial_slab = this_cpu_read(s->cpu_slab->partial);
   2563	this_cpu_write(s->cpu_slab->partial, NULL);
   2564	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   2565
   2566	if (partial_slab)
   2567		__unfreeze_partials(s, partial_slab);
   2568}
   2569
   2570static void unfreeze_partials_cpu(struct kmem_cache *s,
   2571				  struct kmem_cache_cpu *c)
   2572{
   2573	struct slab *partial_slab;
   2574
   2575	partial_slab = slub_percpu_partial(c);
   2576	c->partial = NULL;
   2577
   2578	if (partial_slab)
   2579		__unfreeze_partials(s, partial_slab);
   2580}
   2581
   2582/*
   2583 * Put a slab that was just frozen (in __slab_free|get_partial_node) into a
   2584 * partial slab slot if available.
   2585 *
   2586 * If we did not find a slot then simply move all the partials to the
   2587 * per node partial list.
   2588 */
   2589static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
   2590{
   2591	struct slab *oldslab;
   2592	struct slab *slab_to_unfreeze = NULL;
   2593	unsigned long flags;
   2594	int slabs = 0;
   2595
   2596	local_lock_irqsave(&s->cpu_slab->lock, flags);
   2597
   2598	oldslab = this_cpu_read(s->cpu_slab->partial);
   2599
   2600	if (oldslab) {
   2601		if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
   2602			/*
   2603			 * Partial array is full. Move the existing set to the
   2604			 * per node partial list. Postpone the actual unfreezing
   2605			 * outside of the critical section.
   2606			 */
   2607			slab_to_unfreeze = oldslab;
   2608			oldslab = NULL;
   2609		} else {
   2610			slabs = oldslab->slabs;
   2611		}
   2612	}
   2613
   2614	slabs++;
   2615
   2616	slab->slabs = slabs;
   2617	slab->next = oldslab;
   2618
   2619	this_cpu_write(s->cpu_slab->partial, slab);
   2620
   2621	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   2622
   2623	if (slab_to_unfreeze) {
   2624		__unfreeze_partials(s, slab_to_unfreeze);
   2625		stat(s, CPU_PARTIAL_DRAIN);
   2626	}
   2627}
   2628
   2629#else	/* CONFIG_SLUB_CPU_PARTIAL */
   2630
   2631static inline void unfreeze_partials(struct kmem_cache *s) { }
   2632static inline void unfreeze_partials_cpu(struct kmem_cache *s,
   2633				  struct kmem_cache_cpu *c) { }
   2634
   2635#endif	/* CONFIG_SLUB_CPU_PARTIAL */
   2636
   2637static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
   2638{
   2639	unsigned long flags;
   2640	struct slab *slab;
   2641	void *freelist;
   2642
   2643	local_lock_irqsave(&s->cpu_slab->lock, flags);
   2644
   2645	slab = c->slab;
   2646	freelist = c->freelist;
   2647
   2648	c->slab = NULL;
   2649	c->freelist = NULL;
   2650	c->tid = next_tid(c->tid);
   2651
   2652	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   2653
   2654	if (slab) {
   2655		deactivate_slab(s, slab, freelist);
   2656		stat(s, CPUSLAB_FLUSH);
   2657	}
   2658}
   2659
   2660static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
   2661{
   2662	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
   2663	void *freelist = c->freelist;
   2664	struct slab *slab = c->slab;
   2665
   2666	c->slab = NULL;
   2667	c->freelist = NULL;
   2668	c->tid = next_tid(c->tid);
   2669
   2670	if (slab) {
   2671		deactivate_slab(s, slab, freelist);
   2672		stat(s, CPUSLAB_FLUSH);
   2673	}
   2674
   2675	unfreeze_partials_cpu(s, c);
   2676}
   2677
/* Per-cpu work item used by flush_all_cpus_locked() to flush one cache */
struct slub_flush_work {
	struct work_struct work;	/* runs flush_cpu_slab() */
	struct kmem_cache *s;		/* cache to flush */
	bool skip;			/* true if no work was queued for this cpu */
};
   2683
   2684/*
   2685 * Flush cpu slab.
   2686 *
   2687 * Called from CPU work handler with migration disabled.
   2688 */
   2689static void flush_cpu_slab(struct work_struct *w)
   2690{
   2691	struct kmem_cache *s;
   2692	struct kmem_cache_cpu *c;
   2693	struct slub_flush_work *sfw;
   2694
   2695	sfw = container_of(w, struct slub_flush_work, work);
   2696
   2697	s = sfw->s;
   2698	c = this_cpu_ptr(s->cpu_slab);
   2699
   2700	if (c->slab)
   2701		flush_slab(s, c);
   2702
   2703	unfreeze_partials(s);
   2704}
   2705
   2706static bool has_cpu_slab(int cpu, struct kmem_cache *s)
   2707{
   2708	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
   2709
   2710	return c->slab || slub_percpu_partial(c);
   2711}
   2712
/* Held by flush_all_cpus_locked() while it uses the slub_flush items */
static DEFINE_MUTEX(flush_lock);
/* One flush work item per cpu, shared by all caches */
static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
   2715
   2716static void flush_all_cpus_locked(struct kmem_cache *s)
   2717{
   2718	struct slub_flush_work *sfw;
   2719	unsigned int cpu;
   2720
   2721	lockdep_assert_cpus_held();
   2722	mutex_lock(&flush_lock);
   2723
   2724	for_each_online_cpu(cpu) {
   2725		sfw = &per_cpu(slub_flush, cpu);
   2726		if (!has_cpu_slab(cpu, s)) {
   2727			sfw->skip = true;
   2728			continue;
   2729		}
   2730		INIT_WORK(&sfw->work, flush_cpu_slab);
   2731		sfw->skip = false;
   2732		sfw->s = s;
   2733		schedule_work_on(cpu, &sfw->work);
   2734	}
   2735
   2736	for_each_online_cpu(cpu) {
   2737		sfw = &per_cpu(slub_flush, cpu);
   2738		if (sfw->skip)
   2739			continue;
   2740		flush_work(&sfw->work);
   2741	}
   2742
   2743	mutex_unlock(&flush_lock);
   2744}
   2745
   2746static void flush_all(struct kmem_cache *s)
   2747{
   2748	cpus_read_lock();
   2749	flush_all_cpus_locked(s);
   2750	cpus_read_unlock();
   2751}
   2752
   2753/*
   2754 * Use the cpu notifier to insure that the cpu slabs are flushed when
   2755 * necessary.
   2756 */
   2757static int slub_cpu_dead(unsigned int cpu)
   2758{
   2759	struct kmem_cache *s;
   2760
   2761	mutex_lock(&slab_mutex);
   2762	list_for_each_entry(s, &slab_caches, list)
   2763		__flush_cpu_slab(s, cpu);
   2764	mutex_unlock(&slab_mutex);
   2765	return 0;
   2766}
   2767
   2768/*
   2769 * Check if the objects in a per cpu structure fit numa
   2770 * locality expectations.
   2771 */
   2772static inline int node_match(struct slab *slab, int node)
   2773{
   2774#ifdef CONFIG_NUMA
   2775	if (node != NUMA_NO_NODE && slab_nid(slab) != node)
   2776		return 0;
   2777#endif
   2778	return 1;
   2779}
   2780
   2781#ifdef CONFIG_SLUB_DEBUG
   2782static int count_free(struct slab *slab)
   2783{
   2784	return slab->objects - slab->inuse;
   2785}
   2786
   2787static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
   2788{
   2789	return atomic_long_read(&n->total_objects);
   2790}
   2791#endif /* CONFIG_SLUB_DEBUG */
   2792
   2793#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
   2794static unsigned long count_partial(struct kmem_cache_node *n,
   2795					int (*get_count)(struct slab *))
   2796{
   2797	unsigned long flags;
   2798	unsigned long x = 0;
   2799	struct slab *slab;
   2800
   2801	spin_lock_irqsave(&n->list_lock, flags);
   2802	list_for_each_entry(slab, &n->partial, slab_list)
   2803		x += get_count(slab);
   2804	spin_unlock_irqrestore(&n->list_lock, flags);
   2805	return x;
   2806}
   2807#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
   2808
   2809static noinline void
   2810slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
   2811{
   2812#ifdef CONFIG_SLUB_DEBUG
   2813	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
   2814				      DEFAULT_RATELIMIT_BURST);
   2815	int node;
   2816	struct kmem_cache_node *n;
   2817
   2818	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
   2819		return;
   2820
   2821	pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
   2822		nid, gfpflags, &gfpflags);
   2823	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
   2824		s->name, s->object_size, s->size, oo_order(s->oo),
   2825		oo_order(s->min));
   2826
   2827	if (oo_order(s->min) > get_order(s->object_size))
   2828		pr_warn("  %s debugging increased min order, use slub_debug=O to disable.\n",
   2829			s->name);
   2830
   2831	for_each_kmem_cache_node(s, node, n) {
   2832		unsigned long nr_slabs;
   2833		unsigned long nr_objs;
   2834		unsigned long nr_free;
   2835
   2836		nr_free  = count_partial(n, count_free);
   2837		nr_slabs = node_nr_slabs(n);
   2838		nr_objs  = node_nr_objs(n);
   2839
   2840		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
   2841			node, nr_slabs, nr_objs, nr_free);
   2842	}
   2843#endif
   2844}
   2845
   2846static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
   2847{
   2848	if (unlikely(slab_test_pfmemalloc(slab)))
   2849		return gfp_pfmemalloc_allowed(gfpflags);
   2850
   2851	return true;
   2852}
   2853
/*
 * Check the slab->freelist and either transfer the freelist to the
 * per cpu freelist or deactivate the slab.
 *
 * The slab is still frozen if the return value is not NULL.
 *
 * If this function returns NULL then the slab has been unfrozen.
 *
 * Must be called with the cpu_slab local lock held (asserted via lockdep).
 * In both cases the slab's own freelist is set to NULL and inuse is set to
 * slab->objects.
 */
static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
	struct slab new;
	unsigned long counters;
	void *freelist;

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	/* Retry until freelist and counters are swapped in one go */
	do {
		freelist = slab->freelist;
		counters = slab->counters;

		new.counters = counters;
		VM_BUG_ON(!new.frozen);

		new.inuse = slab->objects;
		/* Stay frozen only if there is a freelist to hand over */
		new.frozen = freelist != NULL;

	} while (!__cmpxchg_double_slab(s, slab,
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}
   2887
   2888/*
   2889 * Slow path. The lockless freelist is empty or we need to perform
   2890 * debugging duties.
   2891 *
   2892 * Processing is still very fast if new objects have been freed to the
   2893 * regular freelist. In that case we simply take over the regular freelist
   2894 * as the lockless freelist and zap the regular freelist.
   2895 *
   2896 * If that is not working then we fall back to the partial lists. We take the
   2897 * first element of the freelist as the object to allocate now and move the
   2898 * rest of the freelist to the lockless freelist.
   2899 *
   2900 * And if we were unable to get a new slab from the partial slab lists then
   2901 * we need to allocate a new slab. This is the slowest path since it involves
   2902 * a call to the page allocator and the setup of a new slab.
   2903 *
   2904 * Version of __slab_alloc to use when we know that preemption is
   2905 * already disabled (which is the case for bulk allocation).
   2906 */
   2907static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
   2908			  unsigned long addr, struct kmem_cache_cpu *c)
   2909{
   2910	void *freelist;
   2911	struct slab *slab;
   2912	unsigned long flags;
   2913
   2914	stat(s, ALLOC_SLOWPATH);
   2915
   2916reread_slab:
   2917
   2918	slab = READ_ONCE(c->slab);
   2919	if (!slab) {
   2920		/*
   2921		 * if the node is not online or has no normal memory, just
   2922		 * ignore the node constraint
   2923		 */
   2924		if (unlikely(node != NUMA_NO_NODE &&
   2925			     !node_isset(node, slab_nodes)))
   2926			node = NUMA_NO_NODE;
   2927		goto new_slab;
   2928	}
   2929redo:
   2930
   2931	if (unlikely(!node_match(slab, node))) {
   2932		/*
   2933		 * same as above but node_match() being false already
   2934		 * implies node != NUMA_NO_NODE
   2935		 */
   2936		if (!node_isset(node, slab_nodes)) {
   2937			node = NUMA_NO_NODE;
   2938		} else {
   2939			stat(s, ALLOC_NODE_MISMATCH);
   2940			goto deactivate_slab;
   2941		}
   2942	}
   2943
   2944	/*
   2945	 * By rights, we should be searching for a slab page that was
   2946	 * PFMEMALLOC but right now, we are losing the pfmemalloc
   2947	 * information when the page leaves the per-cpu allocator
   2948	 */
   2949	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
   2950		goto deactivate_slab;
   2951
   2952	/* must check again c->slab in case we got preempted and it changed */
   2953	local_lock_irqsave(&s->cpu_slab->lock, flags);
   2954	if (unlikely(slab != c->slab)) {
   2955		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   2956		goto reread_slab;
   2957	}
   2958	freelist = c->freelist;
   2959	if (freelist)
   2960		goto load_freelist;
   2961
   2962	freelist = get_freelist(s, slab);
   2963
   2964	if (!freelist) {
   2965		c->slab = NULL;
   2966		c->tid = next_tid(c->tid);
   2967		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   2968		stat(s, DEACTIVATE_BYPASS);
   2969		goto new_slab;
   2970	}
   2971
   2972	stat(s, ALLOC_REFILL);
   2973
   2974load_freelist:
   2975
   2976	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
   2977
   2978	/*
   2979	 * freelist is pointing to the list of objects to be used.
   2980	 * slab is pointing to the slab from which the objects are obtained.
   2981	 * That slab must be frozen for per cpu allocations to work.
   2982	 */
   2983	VM_BUG_ON(!c->slab->frozen);
   2984	c->freelist = get_freepointer(s, freelist);
   2985	c->tid = next_tid(c->tid);
   2986	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   2987	return freelist;
   2988
   2989deactivate_slab:
   2990
   2991	local_lock_irqsave(&s->cpu_slab->lock, flags);
   2992	if (slab != c->slab) {
   2993		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   2994		goto reread_slab;
   2995	}
   2996	freelist = c->freelist;
   2997	c->slab = NULL;
   2998	c->freelist = NULL;
   2999	c->tid = next_tid(c->tid);
   3000	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   3001	deactivate_slab(s, slab, freelist);
   3002
   3003new_slab:
   3004
   3005	if (slub_percpu_partial(c)) {
   3006		local_lock_irqsave(&s->cpu_slab->lock, flags);
   3007		if (unlikely(c->slab)) {
   3008			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   3009			goto reread_slab;
   3010		}
   3011		if (unlikely(!slub_percpu_partial(c))) {
   3012			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   3013			/* we were preempted and partial list got empty */
   3014			goto new_objects;
   3015		}
   3016
   3017		slab = c->slab = slub_percpu_partial(c);
   3018		slub_set_percpu_partial(c, slab);
   3019		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   3020		stat(s, CPU_PARTIAL_ALLOC);
   3021		goto redo;
   3022	}
   3023
   3024new_objects:
   3025
   3026	freelist = get_partial(s, gfpflags, node, &slab);
   3027	if (freelist)
   3028		goto check_new_slab;
   3029
   3030	slub_put_cpu_ptr(s->cpu_slab);
   3031	slab = new_slab(s, gfpflags, node);
   3032	c = slub_get_cpu_ptr(s->cpu_slab);
   3033
   3034	if (unlikely(!slab)) {
   3035		slab_out_of_memory(s, gfpflags, node);
   3036		return NULL;
   3037	}
   3038
   3039	/*
   3040	 * No other reference to the slab yet so we can
   3041	 * muck around with it freely without cmpxchg
   3042	 */
   3043	freelist = slab->freelist;
   3044	slab->freelist = NULL;
   3045
   3046	stat(s, ALLOC_SLAB);
   3047
   3048check_new_slab:
   3049
   3050	if (kmem_cache_debug(s)) {
   3051		if (!alloc_debug_processing(s, slab, freelist, addr)) {
   3052			/* Slab failed checks. Next slab needed */
   3053			goto new_slab;
   3054		} else {
   3055			/*
   3056			 * For debug case, we don't load freelist so that all
   3057			 * allocations go through alloc_debug_processing()
   3058			 */
   3059			goto return_single;
   3060		}
   3061	}
   3062
   3063	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
   3064		/*
   3065		 * For !pfmemalloc_match() case we don't load freelist so that
   3066		 * we don't make further mismatched allocations easier.
   3067		 */
   3068		goto return_single;
   3069
   3070retry_load_slab:
   3071
   3072	local_lock_irqsave(&s->cpu_slab->lock, flags);
   3073	if (unlikely(c->slab)) {
   3074		void *flush_freelist = c->freelist;
   3075		struct slab *flush_slab = c->slab;
   3076
   3077		c->slab = NULL;
   3078		c->freelist = NULL;
   3079		c->tid = next_tid(c->tid);
   3080
   3081		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
   3082
   3083		deactivate_slab(s, flush_slab, flush_freelist);
   3084
   3085		stat(s, CPUSLAB_FLUSH);
   3086
   3087		goto retry_load_slab;
   3088	}
   3089	c->slab = slab;
   3090
   3091	goto load_freelist;
   3092
   3093return_single:
   3094
   3095	deactivate_slab(s, slab, get_freepointer(s, freelist));
   3096	return freelist;
   3097}
   3098
   3099/*
   3100 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
   3101 * disabled. Compensates for possible cpu changes by refetching the per cpu area
   3102 * pointer.
   3103 */
   3104static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
   3105			  unsigned long addr, struct kmem_cache_cpu *c)
   3106{
   3107	void *p;
   3108
   3109#ifdef CONFIG_PREEMPT_COUNT
   3110	/*
   3111	 * We may have been preempted and rescheduled on a different
   3112	 * cpu before disabling preemption. Need to reload cpu area
   3113	 * pointer.
   3114	 */
   3115	c = slub_get_cpu_ptr(s->cpu_slab);
   3116#endif
   3117
   3118	p = ___slab_alloc(s, gfpflags, node, addr, c);
   3119#ifdef CONFIG_PREEMPT_COUNT
   3120	slub_put_cpu_ptr(s->cpu_slab);
   3121#endif
   3122	return p;
   3123}
   3124
   3125/*
   3126 * If the object has been wiped upon free, make sure it's fully initialized by
   3127 * zeroing out freelist pointer.
   3128 */
   3129static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
   3130						   void *obj)
   3131{
   3132	if (unlikely(slab_want_init_on_free(s)) && obj)
   3133		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
   3134			0, sizeof(void *));
   3135}
   3136
   3137/*
   3138 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
   3139 * have the fastpath folded into their functions. So no function call
   3140 * overhead for requests that can be satisfied on the fastpath.
   3141 *
   3142 * The fastpath works by first checking if the lockless freelist can be used.
   3143 * If not then __slab_alloc is called for slow processing.
   3144 *
   3145 * Otherwise we can simply pick the next object from the lockless free list.
   3146 */
   3147static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
   3148		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
   3149{
   3150	void *object;
   3151	struct kmem_cache_cpu *c;
   3152	struct slab *slab;
   3153	unsigned long tid;
   3154	struct obj_cgroup *objcg = NULL;
   3155	bool init = false;
   3156
   3157	s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
   3158	if (!s)
   3159		return NULL;
   3160
   3161	object = kfence_alloc(s, orig_size, gfpflags);
   3162	if (unlikely(object))
   3163		goto out;
   3164
   3165redo:
   3166	/*
   3167	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
   3168	 * enabled. We may switch back and forth between cpus while
   3169	 * reading from one cpu area. That does not matter as long
   3170	 * as we end up on the original cpu again when doing the cmpxchg.
   3171	 *
   3172	 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
   3173	 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
   3174	 * the tid. If we are preempted and switched to another cpu between the
   3175	 * two reads, it's OK as the two are still associated with the same cpu
   3176	 * and cmpxchg later will validate the cpu.
   3177	 */
   3178	c = raw_cpu_ptr(s->cpu_slab);
   3179	tid = READ_ONCE(c->tid);
   3180
   3181	/*
   3182	 * Irqless object alloc/free algorithm used here depends on sequence
   3183	 * of fetching cpu_slab's data. tid should be fetched before anything
   3184	 * on c to guarantee that object and slab associated with previous tid
   3185	 * won't be used with current tid. If we fetch tid first, object and
   3186	 * slab could be one associated with next tid and our alloc/free
   3187	 * request will be failed. In this case, we will retry. So, no problem.
   3188	 */
   3189	barrier();
   3190
   3191	/*
   3192	 * The transaction ids are globally unique per cpu and per operation on
   3193	 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
   3194	 * occurs on the right processor and that there was no operation on the
   3195	 * linked list in between.
   3196	 */
   3197
   3198	object = c->freelist;
   3199	slab = c->slab;
   3200	/*
   3201	 * We cannot use the lockless fastpath on PREEMPT_RT because if a
   3202	 * slowpath has taken the local_lock_irqsave(), it is not protected
   3203	 * against a fast path operation in an irq handler. So we need to take
   3204	 * the slow path which uses local_lock. It is still relatively fast if
   3205	 * there is a suitable cpu freelist.
   3206	 */
   3207	if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
   3208	    unlikely(!object || !slab || !node_match(slab, node))) {
   3209		object = __slab_alloc(s, gfpflags, node, addr, c);
   3210	} else {
   3211		void *next_object = get_freepointer_safe(s, object);
   3212
   3213		/*
   3214		 * The cmpxchg will only match if there was no additional
   3215		 * operation and if we are on the right processor.
   3216		 *
   3217		 * The cmpxchg does the following atomically (without lock
   3218		 * semantics!)
   3219		 * 1. Relocate first pointer to the current per cpu area.
   3220		 * 2. Verify that tid and freelist have not been changed
   3221		 * 3. If they were not changed replace tid and freelist
   3222		 *
   3223		 * Since this is without lock semantics the protection is only
   3224		 * against code executing on this cpu *not* from access by
   3225		 * other cpus.
   3226		 */
   3227		if (unlikely(!this_cpu_cmpxchg_double(
   3228				s->cpu_slab->freelist, s->cpu_slab->tid,
   3229				object, tid,
   3230				next_object, next_tid(tid)))) {
   3231
   3232			note_cmpxchg_failure("slab_alloc", s, tid);
   3233			goto redo;
   3234		}
   3235		prefetch_freepointer(s, next_object);
   3236		stat(s, ALLOC_FASTPATH);
   3237	}
   3238
   3239	maybe_wipe_obj_freeptr(s, object);
   3240	init = slab_want_init_on_alloc(gfpflags, s);
   3241
   3242out:
   3243	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
   3244
   3245	return object;
   3246}
   3247
   3248static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
   3249		gfp_t gfpflags, unsigned long addr, size_t orig_size)
   3250{
   3251	return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
   3252}
   3253
   3254static __always_inline
   3255void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
   3256			     gfp_t gfpflags)
   3257{
   3258	void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
   3259
   3260	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
   3261				s->size, gfpflags);
   3262
   3263	return ret;
   3264}
   3265
   3266void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
   3267{
   3268	return __kmem_cache_alloc_lru(s, NULL, gfpflags);
   3269}
   3270EXPORT_SYMBOL(kmem_cache_alloc);
   3271
   3272void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
   3273			   gfp_t gfpflags)
   3274{
   3275	return __kmem_cache_alloc_lru(s, lru, gfpflags);
   3276}
   3277EXPORT_SYMBOL(kmem_cache_alloc_lru);
   3278
   3279#ifdef CONFIG_TRACING
   3280void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
   3281{
   3282	void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
   3283	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
   3284	ret = kasan_kmalloc(s, ret, size, gfpflags);
   3285	return ret;
   3286}
   3287EXPORT_SYMBOL(kmem_cache_alloc_trace);
   3288#endif
   3289
   3290#ifdef CONFIG_NUMA
   3291void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
   3292{
   3293	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
   3294
   3295	trace_kmem_cache_alloc_node(_RET_IP_, ret,
   3296				    s->object_size, s->size, gfpflags, node);
   3297
   3298	return ret;
   3299}
   3300EXPORT_SYMBOL(kmem_cache_alloc_node);
   3301
   3302#ifdef CONFIG_TRACING
   3303void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
   3304				    gfp_t gfpflags,
   3305				    int node, size_t size)
   3306{
   3307	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
   3308
   3309	trace_kmalloc_node(_RET_IP_, ret,
   3310			   size, s->size, gfpflags, node);
   3311
   3312	ret = kasan_kmalloc(s, ret, size, gfpflags);
   3313	return ret;
   3314}
   3315EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
   3316#endif
   3317#endif	/* CONFIG_NUMA */
   3318
   3319/*
   3320 * Slow path handling. This may still be called frequently since objects
   3321 * have a longer lifetime than the cpu slabs in most processing loads.
   3322 *
   3323 * So we still attempt to reduce cache line usage. Just take the slab
   3324 * lock and free the item. If there is no additional partial slab
   3325 * handling required then we can return immediately.
   3326 */
   3327static void __slab_free(struct kmem_cache *s, struct slab *slab,
   3328			void *head, void *tail, int cnt,
   3329			unsigned long addr)
   3330
   3331{
   3332	void *prior;
   3333	int was_frozen;
   3334	struct slab new;
   3335	unsigned long counters;
   3336	struct kmem_cache_node *n = NULL;
   3337	unsigned long flags;
   3338
   3339	stat(s, FREE_SLOWPATH);
   3340
   3341	if (kfence_free(head))
   3342		return;
   3343
   3344	if (kmem_cache_debug(s) &&
   3345	    !free_debug_processing(s, slab, head, tail, cnt, addr))
   3346		return;
   3347
   3348	do {
   3349		if (unlikely(n)) {
   3350			spin_unlock_irqrestore(&n->list_lock, flags);
   3351			n = NULL;
   3352		}
   3353		prior = slab->freelist;
   3354		counters = slab->counters;
   3355		set_freepointer(s, tail, prior);
   3356		new.counters = counters;
   3357		was_frozen = new.frozen;
   3358		new.inuse -= cnt;
   3359		if ((!new.inuse || !prior) && !was_frozen) {
   3360
   3361			if (kmem_cache_has_cpu_partial(s) && !prior) {
   3362
   3363				/*
   3364				 * Slab was on no list before and will be
   3365				 * partially empty
   3366				 * We can defer the list move and instead
   3367				 * freeze it.
   3368				 */
   3369				new.frozen = 1;
   3370
   3371			} else { /* Needs to be taken off a list */
   3372
   3373				n = get_node(s, slab_nid(slab));
   3374				/*
   3375				 * Speculatively acquire the list_lock.
   3376				 * If the cmpxchg does not succeed then we may
   3377				 * drop the list_lock without any processing.
   3378				 *
   3379				 * Otherwise the list_lock will synchronize with
   3380				 * other processors updating the list of slabs.
   3381				 */
   3382				spin_lock_irqsave(&n->list_lock, flags);
   3383
   3384			}
   3385		}
   3386
   3387	} while (!cmpxchg_double_slab(s, slab,
   3388		prior, counters,
   3389		head, new.counters,
   3390		"__slab_free"));
   3391
   3392	if (likely(!n)) {
   3393
   3394		if (likely(was_frozen)) {
   3395			/*
   3396			 * The list lock was not taken therefore no list
   3397			 * activity can be necessary.
   3398			 */
   3399			stat(s, FREE_FROZEN);
   3400		} else if (new.frozen) {
   3401			/*
   3402			 * If we just froze the slab then put it onto the
   3403			 * per cpu partial list.
   3404			 */
   3405			put_cpu_partial(s, slab, 1);
   3406			stat(s, CPU_PARTIAL_FREE);
   3407		}
   3408
   3409		return;
   3410	}
   3411
   3412	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
   3413		goto slab_empty;
   3414
   3415	/*
   3416	 * Objects left in the slab. If it was not on the partial list before
   3417	 * then add it.
   3418	 */
   3419	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
   3420		remove_full(s, n, slab);
   3421		add_partial(n, slab, DEACTIVATE_TO_TAIL);
   3422		stat(s, FREE_ADD_PARTIAL);
   3423	}
   3424	spin_unlock_irqrestore(&n->list_lock, flags);
   3425	return;
   3426
   3427slab_empty:
   3428	if (prior) {
   3429		/*
   3430		 * Slab on the partial list.
   3431		 */
   3432		remove_partial(n, slab);
   3433		stat(s, FREE_REMOVE_PARTIAL);
   3434	} else {
   3435		/* Slab must be on the full list */
   3436		remove_full(s, n, slab);
   3437	}
   3438
   3439	spin_unlock_irqrestore(&n->list_lock, flags);
   3440	stat(s, FREE_SLAB);
   3441	discard_slab(s, slab);
   3442}
   3443
   3444/*
   3445 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
   3446 * can perform fastpath freeing without additional function calls.
   3447 *
   3448 * The fastpath is only possible if we are freeing to the current cpu slab
   3449 * of this processor. This typically the case if we have just allocated
   3450 * the item before.
   3451 *
   3452 * If fastpath is not possible then fall back to __slab_free where we deal
   3453 * with all sorts of special processing.
   3454 *
   3455 * Bulk free of a freelist with several objects (all pointing to the
   3456 * same slab) possible by specifying head and tail ptr, plus objects
   3457 * count (cnt). Bulk free indicated by tail pointer being set.
   3458 */
   3459static __always_inline void do_slab_free(struct kmem_cache *s,
   3460				struct slab *slab, void *head, void *tail,
   3461				int cnt, unsigned long addr)
   3462{
   3463	void *tail_obj = tail ? : head;
   3464	struct kmem_cache_cpu *c;
   3465	unsigned long tid;
   3466
   3467	/* memcg_slab_free_hook() is already called for bulk free. */
   3468	if (!tail)
   3469		memcg_slab_free_hook(s, &head, 1);
   3470redo:
   3471	/*
   3472	 * Determine the currently cpus per cpu slab.
   3473	 * The cpu may change afterward. However that does not matter since
   3474	 * data is retrieved via this pointer. If we are on the same cpu
   3475	 * during the cmpxchg then the free will succeed.
   3476	 */
   3477	c = raw_cpu_ptr(s->cpu_slab);
   3478	tid = READ_ONCE(c->tid);
   3479
   3480	/* Same with comment on barrier() in slab_alloc_node() */
   3481	barrier();
   3482
   3483	if (likely(slab == c->slab)) {
   3484#ifndef CONFIG_PREEMPT_RT
   3485		void **freelist = READ_ONCE(c->freelist);
   3486
   3487		set_freepointer(s, tail_obj, freelist);
   3488
   3489		if (unlikely(!this_cpu_cmpxchg_double(
   3490				s->cpu_slab->freelist, s->cpu_slab->tid,
   3491				freelist, tid,
   3492				head, next_tid(tid)))) {
   3493
   3494			note_cmpxchg_failure("slab_free", s, tid);
   3495			goto redo;
   3496		}
   3497#else /* CONFIG_PREEMPT_RT */
   3498		/*
   3499		 * We cannot use the lockless fastpath on PREEMPT_RT because if
   3500		 * a slowpath has taken the local_lock_irqsave(), it is not
   3501		 * protected against a fast path operation in an irq handler. So
   3502		 * we need to take the local_lock. We shouldn't simply defer to
   3503		 * __slab_free() as that wouldn't use the cpu freelist at all.
   3504		 */
   3505		void **freelist;
   3506
   3507		local_lock(&s->cpu_slab->lock);
   3508		c = this_cpu_ptr(s->cpu_slab);
   3509		if (unlikely(slab != c->slab)) {
   3510			local_unlock(&s->cpu_slab->lock);
   3511			goto redo;
   3512		}
   3513		tid = c->tid;
   3514		freelist = c->freelist;
   3515
   3516		set_freepointer(s, tail_obj, freelist);
   3517		c->freelist = head;
   3518		c->tid = next_tid(tid);
   3519
   3520		local_unlock(&s->cpu_slab->lock);
   3521#endif
   3522		stat(s, FREE_FASTPATH);
   3523	} else
   3524		__slab_free(s, slab, head, tail_obj, cnt, addr);
   3525
   3526}
   3527
   3528static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
   3529				      void *head, void *tail, int cnt,
   3530				      unsigned long addr)
   3531{
   3532	/*
   3533	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
   3534	 * to remove objects, whose reuse must be delayed.
   3535	 */
   3536	if (slab_free_freelist_hook(s, &head, &tail, &cnt))
   3537		do_slab_free(s, slab, head, tail, cnt, addr);
   3538}
   3539
   3540#ifdef CONFIG_KASAN_GENERIC
   3541void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
   3542{
   3543	do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr);
   3544}
   3545#endif
   3546
   3547void kmem_cache_free(struct kmem_cache *s, void *x)
   3548{
   3549	s = cache_from_obj(s, x);
   3550	if (!s)
   3551		return;
   3552	trace_kmem_cache_free(_RET_IP_, x, s->name);
   3553	slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_);
   3554}
   3555EXPORT_SYMBOL(kmem_cache_free);
   3556
/*
 * A chain of objects from a single slab, built by build_detached_freelist()
 * and handed to slab_free() in one piece by kmem_cache_free_bulk().
 */
struct detached_freelist {
	struct slab *slab;	/* slab the chained objects belong to; NULL if no chain was built */
	void *tail;		/* last object of the chain (the first one that was picked) */
	void *freelist;		/* head of the chain */
	int cnt;		/* number of objects on the chain */
	struct kmem_cache *s;	/* cache the objects are freed to */
};
   3564
   3565static inline void free_large_kmalloc(struct folio *folio, void *object)
   3566{
   3567	unsigned int order = folio_order(folio);
   3568
   3569	if (WARN_ON_ONCE(order == 0))
   3570		pr_warn_once("object pointer: 0x%p\n", object);
   3571
   3572	kfree_hook(object);
   3573	mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
   3574			      -(PAGE_SIZE << order));
   3575	__free_pages(folio_page(folio, 0), order);
   3576}
   3577
   3578/*
   3579 * This function progressively scans the array with free objects (with
   3580 * a limited look ahead) and extract objects belonging to the same
   3581 * slab.  It builds a detached freelist directly within the given
   3582 * slab/objects.  This can happen without any need for
   3583 * synchronization, because the objects are owned by running process.
   3584 * The freelist is build up as a single linked list in the objects.
   3585 * The idea is, that this detached freelist can then be bulk
   3586 * transferred to the real freelist(s), but only requiring a single
   3587 * synchronization primitive.  Look ahead in the array is limited due
   3588 * to performance reasons.
   3589 */
   3590static inline
   3591int build_detached_freelist(struct kmem_cache *s, size_t size,
   3592			    void **p, struct detached_freelist *df)
   3593{
   3594	size_t first_skipped_index = 0;
   3595	int lookahead = 3;
   3596	void *object;
   3597	struct folio *folio;
   3598	struct slab *slab;
   3599
   3600	/* Always re-init detached_freelist */
   3601	df->slab = NULL;
   3602
   3603	do {
   3604		object = p[--size];
   3605		/* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
   3606	} while (!object && size);
   3607
   3608	if (!object)
   3609		return 0;
   3610
   3611	folio = virt_to_folio(object);
   3612	if (!s) {
   3613		/* Handle kalloc'ed objects */
   3614		if (unlikely(!folio_test_slab(folio))) {
   3615			free_large_kmalloc(folio, object);
   3616			p[size] = NULL; /* mark object processed */
   3617			return size;
   3618		}
   3619		/* Derive kmem_cache from object */
   3620		slab = folio_slab(folio);
   3621		df->s = slab->slab_cache;
   3622	} else {
   3623		slab = folio_slab(folio);
   3624		df->s = cache_from_obj(s, object); /* Support for memcg */
   3625	}
   3626
   3627	if (is_kfence_address(object)) {
   3628		slab_free_hook(df->s, object, false);
   3629		__kfence_free(object);
   3630		p[size] = NULL; /* mark object processed */
   3631		return size;
   3632	}
   3633
   3634	/* Start new detached freelist */
   3635	df->slab = slab;
   3636	set_freepointer(df->s, object, NULL);
   3637	df->tail = object;
   3638	df->freelist = object;
   3639	p[size] = NULL; /* mark object processed */
   3640	df->cnt = 1;
   3641
   3642	while (size) {
   3643		object = p[--size];
   3644		if (!object)
   3645			continue; /* Skip processed objects */
   3646
   3647		/* df->slab is always set at this point */
   3648		if (df->slab == virt_to_slab(object)) {
   3649			/* Opportunity build freelist */
   3650			set_freepointer(df->s, object, df->freelist);
   3651			df->freelist = object;
   3652			df->cnt++;
   3653			p[size] = NULL; /* mark object processed */
   3654
   3655			continue;
   3656		}
   3657
   3658		/* Limit look ahead search */
   3659		if (!--lookahead)
   3660			break;
   3661
   3662		if (!first_skipped_index)
   3663			first_skipped_index = size + 1;
   3664	}
   3665
   3666	return first_skipped_index;
   3667}
   3668
   3669/* Note that interrupts must be enabled when calling this function. */
   3670void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
   3671{
   3672	if (WARN_ON(!size))
   3673		return;
   3674
   3675	memcg_slab_free_hook(s, p, size);
   3676	do {
   3677		struct detached_freelist df;
   3678
   3679		size = build_detached_freelist(s, size, p, &df);
   3680		if (!df.slab)
   3681			continue;
   3682
   3683		slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_);
   3684	} while (likely(size));
   3685}
   3686EXPORT_SYMBOL(kmem_cache_free_bulk);
   3687
   3688/* Note that interrupts must be enabled when calling this function. */
   3689int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
   3690			  void **p)
   3691{
   3692	struct kmem_cache_cpu *c;
   3693	int i;
   3694	struct obj_cgroup *objcg = NULL;
   3695
   3696	/* memcg and kmem_cache debug support */
   3697	s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
   3698	if (unlikely(!s))
   3699		return false;
   3700	/*
   3701	 * Drain objects in the per cpu slab, while disabling local
   3702	 * IRQs, which protects against PREEMPT and interrupts
   3703	 * handlers invoking normal fastpath.
   3704	 */
   3705	c = slub_get_cpu_ptr(s->cpu_slab);
   3706	local_lock_irq(&s->cpu_slab->lock);
   3707
   3708	for (i = 0; i < size; i++) {
   3709		void *object = kfence_alloc(s, s->object_size, flags);
   3710
   3711		if (unlikely(object)) {
   3712			p[i] = object;
   3713			continue;
   3714		}
   3715
   3716		object = c->freelist;
   3717		if (unlikely(!object)) {
   3718			/*
   3719			 * We may have removed an object from c->freelist using
   3720			 * the fastpath in the previous iteration; in that case,
   3721			 * c->tid has not been bumped yet.
   3722			 * Since ___slab_alloc() may reenable interrupts while
   3723			 * allocating memory, we should bump c->tid now.
   3724			 */
   3725			c->tid = next_tid(c->tid);
   3726
   3727			local_unlock_irq(&s->cpu_slab->lock);
   3728
   3729			/*
   3730			 * Invoking slow path likely have side-effect
   3731			 * of re-populating per CPU c->freelist
   3732			 */
   3733			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
   3734					    _RET_IP_, c);
   3735			if (unlikely(!p[i]))
   3736				goto error;
   3737
   3738			c = this_cpu_ptr(s->cpu_slab);
   3739			maybe_wipe_obj_freeptr(s, p[i]);
   3740
   3741			local_lock_irq(&s->cpu_slab->lock);
   3742
   3743			continue; /* goto for-loop */
   3744		}
   3745		c->freelist = get_freepointer(s, object);
   3746		p[i] = object;
   3747		maybe_wipe_obj_freeptr(s, p[i]);
   3748	}
   3749	c->tid = next_tid(c->tid);
   3750	local_unlock_irq(&s->cpu_slab->lock);
   3751	slub_put_cpu_ptr(s->cpu_slab);
   3752
   3753	/*
   3754	 * memcg and kmem_cache debug support and memory initialization.
   3755	 * Done outside of the IRQ disabled fastpath loop.
   3756	 */
   3757	slab_post_alloc_hook(s, objcg, flags, size, p,
   3758				slab_want_init_on_alloc(flags, s));
   3759	return i;
   3760error:
   3761	slub_put_cpu_ptr(s->cpu_slab);
   3762	slab_post_alloc_hook(s, objcg, flags, i, p, false);
   3763	__kmem_cache_free_bulk(s, i, p);
   3764	return 0;
   3765}
   3766EXPORT_SYMBOL(kmem_cache_alloc_bulk);
   3767
   3768
   3769/*
   3770 * Object placement in a slab is made very easy because we always start at
   3771 * offset 0. If we tune the size of the object to the alignment then we can
   3772 * get the required alignment by putting one properly sized object after
   3773 * another.
   3774 *
   3775 * Notice that the allocation order determines the sizes of the per cpu
   3776 * caches. Each processor has always one slab available for allocations.
   3777 * Increasing the allocation order reduces the number of times that slabs
   3778 * must be moved on and off the partial lists and is therefore a factor in
   3779 * locking overhead.
   3780 */
   3781
/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 *
 * slub_min_order:   lowest order calc_slab_order() starts its search from.
 * slub_max_order:   highest order calculate_order() asks calc_slab_order()
 *                   to consider; defaults to PAGE_ALLOC_COSTLY_ORDER.
 * slub_min_objects: minimum number of objects wanted per slab; while it is
 *                   0, calculate_order() derives a value from the number
 *                   of cpus instead.
 */
static unsigned int slub_min_order;
static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
static unsigned int slub_min_objects;
   3791
   3792/*
   3793 * Calculate the order of allocation given an slab object size.
   3794 *
   3795 * The order of allocation has significant impact on performance and other
   3796 * system components. Generally order 0 allocations should be preferred since
   3797 * order 0 does not cause fragmentation in the page allocator. Larger objects
   3798 * be problematic to put into order 0 slabs because there may be too much
   3799 * unused space left. We go to a higher order if more than 1/16th of the slab
   3800 * would be wasted.
   3801 *
   3802 * In order to reach satisfactory performance we must ensure that a minimum
   3803 * number of objects is in one slab. Otherwise we may generate too much
   3804 * activity on the partial lists which requires taking the list_lock. This is
   3805 * less a concern for large slabs though which are rarely used.
   3806 *
   3807 * slub_max_order specifies the order where we begin to stop considering the
   3808 * number of objects in a slab as critical. If we reach slub_max_order then
   3809 * we try to keep the page order as low as possible. So we accept more waste
   3810 * of space in favor of a small page order.
   3811 *
   3812 * Higher order allocations also allow the placement of more objects in a
   3813 * slab and thereby reduce object handling overhead. If the user has
   3814 * requested a higher minimum order then we start with that one instead of
   3815 * the smallest order which will fit the object.
   3816 */
   3817static inline unsigned int calc_slab_order(unsigned int size,
   3818		unsigned int min_objects, unsigned int max_order,
   3819		unsigned int fract_leftover)
   3820{
   3821	unsigned int min_order = slub_min_order;
   3822	unsigned int order;
   3823
   3824	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
   3825		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
   3826
   3827	for (order = max(min_order, (unsigned int)get_order(min_objects * size));
   3828			order <= max_order; order++) {
   3829
   3830		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
   3831		unsigned int rem;
   3832
   3833		rem = slab_size % size;
   3834
   3835		if (rem <= slab_size / fract_leftover)
   3836			break;
   3837	}
   3838
   3839	return order;
   3840}
   3841
   3842static inline int calculate_order(unsigned int size)
   3843{
   3844	unsigned int order;
   3845	unsigned int min_objects;
   3846	unsigned int max_objects;
   3847	unsigned int nr_cpus;
   3848
   3849	/*
   3850	 * Attempt to find best configuration for a slab. This
   3851	 * works by first attempting to generate a layout with
   3852	 * the best configuration and backing off gradually.
   3853	 *
   3854	 * First we increase the acceptable waste in a slab. Then
   3855	 * we reduce the minimum objects required in a slab.
   3856	 */
   3857	min_objects = slub_min_objects;
   3858	if (!min_objects) {
   3859		/*
   3860		 * Some architectures will only update present cpus when
   3861		 * onlining them, so don't trust the number if it's just 1. But
   3862		 * we also don't want to use nr_cpu_ids always, as on some other
   3863		 * architectures, there can be many possible cpus, but never
   3864		 * onlined. Here we compromise between trying to avoid too high
   3865		 * order on systems that appear larger than they are, and too
   3866		 * low order on systems that appear smaller than they are.
   3867		 */
   3868		nr_cpus = num_present_cpus();
   3869		if (nr_cpus <= 1)
   3870			nr_cpus = nr_cpu_ids;
   3871		min_objects = 4 * (fls(nr_cpus) + 1);
   3872	}
   3873	max_objects = order_objects(slub_max_order, size);
   3874	min_objects = min(min_objects, max_objects);
   3875
   3876	while (min_objects > 1) {
   3877		unsigned int fraction;
   3878
   3879		fraction = 16;
   3880		while (fraction >= 4) {
   3881			order = calc_slab_order(size, min_objects,
   3882					slub_max_order, fraction);
   3883			if (order <= slub_max_order)
   3884				return order;
   3885			fraction /= 2;
   3886		}
   3887		min_objects--;
   3888	}
   3889
   3890	/*
   3891	 * We were unable to place multiple objects in a slab. Now
   3892	 * lets see if we can place a single object there.
   3893	 */
   3894	order = calc_slab_order(size, 1, slub_max_order, 1);
   3895	if (order <= slub_max_order)
   3896		return order;
   3897
   3898	/*
   3899	 * Doh this slab cannot be placed using slub_max_order.
   3900	 */
   3901	order = calc_slab_order(size, 1, MAX_ORDER, 1);
   3902	if (order < MAX_ORDER)
   3903		return order;
   3904	return -ENOSYS;
   3905}
   3906
   3907static void
   3908init_kmem_cache_node(struct kmem_cache_node *n)
   3909{
   3910	n->nr_partial = 0;
   3911	spin_lock_init(&n->list_lock);
   3912	INIT_LIST_HEAD(&n->partial);
   3913#ifdef CONFIG_SLUB_DEBUG
   3914	atomic_long_set(&n->nr_slabs, 0);
   3915	atomic_long_set(&n->total_objects, 0);
   3916	INIT_LIST_HEAD(&n->full);
   3917#endif
   3918}
   3919
   3920static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
   3921{
   3922	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
   3923			KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
   3924
   3925	/*
   3926	 * Must align to double word boundary for the double cmpxchg
   3927	 * instructions to work; see __pcpu_double_call_return_bool().
   3928	 */
   3929	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
   3930				     2 * sizeof(void *));
   3931
   3932	if (!s->cpu_slab)
   3933		return 0;
   3934
   3935	init_kmem_cache_cpus(s);
   3936
   3937	return 1;
   3938}
   3939
   3940static struct kmem_cache *kmem_cache_node;
   3941
   3942/*
   3943 * No kmalloc_node yet so do it by hand. We know that this is the first
   3944 * slab on the node for this slabcache. There are no concurrent accesses
   3945 * possible.
   3946 *
   3947 * Note that this function only works on the kmem_cache_node
   3948 * when allocating for the kmem_cache_node. This is used for bootstrapping
   3949 * memory on a fresh node that has no slab structures yet.
   3950 */
   3951static void early_kmem_cache_node_alloc(int node)
   3952{
   3953	struct slab *slab;
   3954	struct kmem_cache_node *n;
   3955
   3956	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
   3957
   3958	slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
   3959
   3960	BUG_ON(!slab);
   3961	if (slab_nid(slab) != node) {
   3962		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
   3963		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
   3964	}
   3965
   3966	n = slab->freelist;
   3967	BUG_ON(!n);
   3968#ifdef CONFIG_SLUB_DEBUG
   3969	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
   3970	init_tracking(kmem_cache_node, n);
   3971#endif
   3972	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
   3973	slab->freelist = get_freepointer(kmem_cache_node, n);
   3974	slab->inuse = 1;
   3975	slab->frozen = 0;
   3976	kmem_cache_node->node[node] = n;
   3977	init_kmem_cache_node(n);
   3978	inc_slabs_node(kmem_cache_node, node, slab->objects);
   3979
   3980	/*
   3981	 * No locks need to be taken here as it has just been
   3982	 * initialized and there is no concurrent access.
   3983	 */
   3984	__add_partial(n, slab, DEACTIVATE_TO_HEAD);
   3985}
   3986
   3987static void free_kmem_cache_nodes(struct kmem_cache *s)
   3988{
   3989	int node;
   3990	struct kmem_cache_node *n;
   3991
   3992	for_each_kmem_cache_node(s, node, n) {
   3993		s->node[node] = NULL;
   3994		kmem_cache_free(kmem_cache_node, n);
   3995	}
   3996}
   3997
   3998void __kmem_cache_release(struct kmem_cache *s)
   3999{
   4000	cache_random_seq_destroy(s);
   4001	free_percpu(s->cpu_slab);
   4002	free_kmem_cache_nodes(s);
   4003}
   4004
   4005static int init_kmem_cache_nodes(struct kmem_cache *s)
   4006{
   4007	int node;
   4008
   4009	for_each_node_mask(node, slab_nodes) {
   4010		struct kmem_cache_node *n;
   4011
   4012		if (slab_state == DOWN) {
   4013			early_kmem_cache_node_alloc(node);
   4014			continue;
   4015		}
   4016		n = kmem_cache_alloc_node(kmem_cache_node,
   4017						GFP_KERNEL, node);
   4018
   4019		if (!n) {
   4020			free_kmem_cache_nodes(s);
   4021			return 0;
   4022		}
   4023
   4024		init_kmem_cache_node(n);
   4025		s->node[node] = n;
   4026	}
   4027	return 1;
   4028}
   4029
   4030static void set_cpu_partial(struct kmem_cache *s)
   4031{
   4032#ifdef CONFIG_SLUB_CPU_PARTIAL
   4033	unsigned int nr_objects;
   4034
   4035	/*
   4036	 * cpu_partial determined the maximum number of objects kept in the
   4037	 * per cpu partial lists of a processor.
   4038	 *
   4039	 * Per cpu partial lists mainly contain slabs that just have one
   4040	 * object freed. If they are used for allocation then they can be
   4041	 * filled up again with minimal effort. The slab will never hit the
   4042	 * per node partial lists and therefore no locking will be required.
   4043	 *
   4044	 * For backwards compatibility reasons, this is determined as number
   4045	 * of objects, even though we now limit maximum number of pages, see
   4046	 * slub_set_cpu_partial()
   4047	 */
   4048	if (!kmem_cache_has_cpu_partial(s))
   4049		nr_objects = 0;
   4050	else if (s->size >= PAGE_SIZE)
   4051		nr_objects = 6;
   4052	else if (s->size >= 1024)
   4053		nr_objects = 24;
   4054	else if (s->size >= 256)
   4055		nr_objects = 52;
   4056	else
   4057		nr_objects = 120;
   4058
   4059	slub_set_cpu_partial(s, nr_objects);
   4060#endif
   4061}
   4062
   4063/*
   4064 * calculate_sizes() determines the order and the distribution of data within
   4065 * a slab object.
   4066 */
   4067static int calculate_sizes(struct kmem_cache *s)
   4068{
   4069	slab_flags_t flags = s->flags;
   4070	unsigned int size = s->object_size;
   4071	unsigned int order;
   4072
   4073	/*
   4074	 * Round up object size to the next word boundary. We can only
   4075	 * place the free pointer at word boundaries and this determines
   4076	 * the possible location of the free pointer.
   4077	 */
   4078	size = ALIGN(size, sizeof(void *));
   4079
   4080#ifdef CONFIG_SLUB_DEBUG
   4081	/*
   4082	 * Determine if we can poison the object itself. If the user of
   4083	 * the slab may touch the object after free or before allocation
   4084	 * then we should never poison the object itself.
   4085	 */
   4086	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
   4087			!s->ctor)
   4088		s->flags |= __OBJECT_POISON;
   4089	else
   4090		s->flags &= ~__OBJECT_POISON;
   4091
   4092
   4093	/*
   4094	 * If we are Redzoning then check if there is some space between the
   4095	 * end of the object and the free pointer. If not then add an
   4096	 * additional word to have some bytes to store Redzone information.
   4097	 */
   4098	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
   4099		size += sizeof(void *);
   4100#endif
   4101
   4102	/*
   4103	 * With that we have determined the number of bytes in actual use
   4104	 * by the object and redzoning.
   4105	 */
   4106	s->inuse = size;
   4107
   4108	if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
   4109	    ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
   4110	    s->ctor) {
   4111		/*
   4112		 * Relocate free pointer after the object if it is not
   4113		 * permitted to overwrite the first word of the object on
   4114		 * kmem_cache_free.
   4115		 *
   4116		 * This is the case if we do RCU, have a constructor or
   4117		 * destructor, are poisoning the objects, or are
   4118		 * redzoning an object smaller than sizeof(void *).
   4119		 *
   4120		 * The assumption that s->offset >= s->inuse means free
   4121		 * pointer is outside of the object is used in the
   4122		 * freeptr_outside_object() function. If that is no
   4123		 * longer true, the function needs to be modified.
   4124		 */
   4125		s->offset = size;
   4126		size += sizeof(void *);
   4127	} else {
   4128		/*
   4129		 * Store freelist pointer near middle of object to keep
   4130		 * it away from the edges of the object to avoid small
   4131		 * sized over/underflows from neighboring allocations.
   4132		 */
   4133		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
   4134	}
   4135
   4136#ifdef CONFIG_SLUB_DEBUG
   4137	if (flags & SLAB_STORE_USER)
   4138		/*
   4139		 * Need to store information about allocs and frees after
   4140		 * the object.
   4141		 */
   4142		size += 2 * sizeof(struct track);
   4143#endif
   4144
   4145	kasan_cache_create(s, &size, &s->flags);
   4146#ifdef CONFIG_SLUB_DEBUG
   4147	if (flags & SLAB_RED_ZONE) {
   4148		/*
   4149		 * Add some empty padding so that we can catch
   4150		 * overwrites from earlier objects rather than let
   4151		 * tracking information or the free pointer be
   4152		 * corrupted if a user writes before the start
   4153		 * of the object.
   4154		 */
   4155		size += sizeof(void *);
   4156
   4157		s->red_left_pad = sizeof(void *);
   4158		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
   4159		size += s->red_left_pad;
   4160	}
   4161#endif
   4162
   4163	/*
   4164	 * SLUB stores one object immediately after another beginning from
   4165	 * offset 0. In order to align the objects we have to simply size
   4166	 * each object to conform to the alignment.
   4167	 */
   4168	size = ALIGN(size, s->align);
   4169	s->size = size;
   4170	s->reciprocal_size = reciprocal_value(size);
   4171	order = calculate_order(size);
   4172
   4173	if ((int)order < 0)
   4174		return 0;
   4175
   4176	s->allocflags = 0;
   4177	if (order)
   4178		s->allocflags |= __GFP_COMP;
   4179
   4180	if (s->flags & SLAB_CACHE_DMA)
   4181		s->allocflags |= GFP_DMA;
   4182
   4183	if (s->flags & SLAB_CACHE_DMA32)
   4184		s->allocflags |= GFP_DMA32;
   4185
   4186	if (s->flags & SLAB_RECLAIM_ACCOUNT)
   4187		s->allocflags |= __GFP_RECLAIMABLE;
   4188
   4189	/*
   4190	 * Determine the number of objects per slab
   4191	 */
   4192	s->oo = oo_make(order, size);
   4193	s->min = oo_make(get_order(size), size);
   4194
   4195	return !!oo_objects(s->oo);
   4196}
   4197
   4198static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
   4199{
   4200	s->flags = kmem_cache_flags(s->size, flags, s->name);
   4201#ifdef CONFIG_SLAB_FREELIST_HARDENED
   4202	s->random = get_random_long();
   4203#endif
   4204
   4205	if (!calculate_sizes(s))
   4206		goto error;
   4207	if (disable_higher_order_debug) {
   4208		/*
   4209		 * Disable debugging flags that store metadata if the min slab
   4210		 * order increased.
   4211		 */
   4212		if (get_order(s->size) > get_order(s->object_size)) {
   4213			s->flags &= ~DEBUG_METADATA_FLAGS;
   4214			s->offset = 0;
   4215			if (!calculate_sizes(s))
   4216				goto error;
   4217		}
   4218	}
   4219
   4220#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
   4221    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
   4222	if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
   4223		/* Enable fast mode */
   4224		s->flags |= __CMPXCHG_DOUBLE;
   4225#endif
   4226
   4227	/*
   4228	 * The larger the object size is, the more slabs we want on the partial
   4229	 * list to avoid pounding the page allocator excessively.
   4230	 */
   4231	s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
   4232	s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
   4233
   4234	set_cpu_partial(s);
   4235
   4236#ifdef CONFIG_NUMA
   4237	s->remote_node_defrag_ratio = 1000;
   4238#endif
   4239
   4240	/* Initialize the pre-computed randomized freelist if slab is up */
   4241	if (slab_state >= UP) {
   4242		if (init_cache_random_seq(s))
   4243			goto error;
   4244	}
   4245
   4246	if (!init_kmem_cache_nodes(s))
   4247		goto error;
   4248
   4249	if (alloc_kmem_cache_cpus(s))
   4250		return 0;
   4251
   4252error:
   4253	__kmem_cache_release(s);
   4254	return -EINVAL;
   4255}
   4256
   4257static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
   4258			      const char *text)
   4259{
   4260#ifdef CONFIG_SLUB_DEBUG
   4261	void *addr = slab_address(slab);
   4262	unsigned long flags;
   4263	unsigned long *map;
   4264	void *p;
   4265
   4266	slab_err(s, slab, text, s->name);
   4267	slab_lock(slab, &flags);
   4268
   4269	map = get_map(s, slab);
   4270	for_each_object(p, s, addr, slab->objects) {
   4271
   4272		if (!test_bit(__obj_to_index(s, addr, p), map)) {
   4273			pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
   4274			print_tracking(s, p);
   4275		}
   4276	}
   4277	put_map(map);
   4278	slab_unlock(slab, &flags);
   4279#endif
   4280}
   4281
   4282/*
   4283 * Attempt to free all partial slabs on a node.
   4284 * This is called from __kmem_cache_shutdown(). We must take list_lock
   4285 * because sysfs file might still access partial list after the shutdowning.
   4286 */
   4287static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
   4288{
   4289	LIST_HEAD(discard);
   4290	struct slab *slab, *h;
   4291
   4292	BUG_ON(irqs_disabled());
   4293	spin_lock_irq(&n->list_lock);
   4294	list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
   4295		if (!slab->inuse) {
   4296			remove_partial(n, slab);
   4297			list_add(&slab->slab_list, &discard);
   4298		} else {
   4299			list_slab_objects(s, slab,
   4300			  "Objects remaining in %s on __kmem_cache_shutdown()");
   4301		}
   4302	}
   4303	spin_unlock_irq(&n->list_lock);
   4304
   4305	list_for_each_entry_safe(slab, h, &discard, slab_list)
   4306		discard_slab(s, slab);
   4307}
   4308
   4309bool __kmem_cache_empty(struct kmem_cache *s)
   4310{
   4311	int node;
   4312	struct kmem_cache_node *n;
   4313
   4314	for_each_kmem_cache_node(s, node, n)
   4315		if (n->nr_partial || slabs_node(s, node))
   4316			return false;
   4317	return true;
   4318}
   4319
   4320/*
   4321 * Release all resources used by a slab cache.
   4322 */
   4323int __kmem_cache_shutdown(struct kmem_cache *s)
   4324{
   4325	int node;
   4326	struct kmem_cache_node *n;
   4327
   4328	flush_all_cpus_locked(s);
   4329	/* Attempt to free all objects */
   4330	for_each_kmem_cache_node(s, node, n) {
   4331		free_partial(s, n);
   4332		if (n->nr_partial || slabs_node(s, node))
   4333			return 1;
   4334	}
   4335	return 0;
   4336}
   4337
   4338#ifdef CONFIG_PRINTK
   4339void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
   4340{
   4341	void *base;
   4342	int __maybe_unused i;
   4343	unsigned int objnr;
   4344	void *objp;
   4345	void *objp0;
   4346	struct kmem_cache *s = slab->slab_cache;
   4347	struct track __maybe_unused *trackp;
   4348
   4349	kpp->kp_ptr = object;
   4350	kpp->kp_slab = slab;
   4351	kpp->kp_slab_cache = s;
   4352	base = slab_address(slab);
   4353	objp0 = kasan_reset_tag(object);
   4354#ifdef CONFIG_SLUB_DEBUG
   4355	objp = restore_red_left(s, objp0);
   4356#else
   4357	objp = objp0;
   4358#endif
   4359	objnr = obj_to_index(s, slab, objp);
   4360	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
   4361	objp = base + s->size * objnr;
   4362	kpp->kp_objp = objp;
   4363	if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
   4364			 || (objp - base) % s->size) ||
   4365	    !(s->flags & SLAB_STORE_USER))
   4366		return;
   4367#ifdef CONFIG_SLUB_DEBUG
   4368	objp = fixup_red_left(s, objp);
   4369	trackp = get_track(s, objp, TRACK_ALLOC);
   4370	kpp->kp_ret = (void *)trackp->addr;
   4371#ifdef CONFIG_STACKDEPOT
   4372	{
   4373		depot_stack_handle_t handle;
   4374		unsigned long *entries;
   4375		unsigned int nr_entries;
   4376
   4377		handle = READ_ONCE(trackp->handle);
   4378		if (handle) {
   4379			nr_entries = stack_depot_fetch(handle, &entries);
   4380			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
   4381				kpp->kp_stack[i] = (void *)entries[i];
   4382		}
   4383
   4384		trackp = get_track(s, objp, TRACK_FREE);
   4385		handle = READ_ONCE(trackp->handle);
   4386		if (handle) {
   4387			nr_entries = stack_depot_fetch(handle, &entries);
   4388			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
   4389				kpp->kp_free_stack[i] = (void *)entries[i];
   4390		}
   4391	}
   4392#endif
   4393#endif
   4394}
   4395#endif
   4396
   4397/********************************************************************
   4398 *		Kmalloc subsystem
   4399 *******************************************************************/
   4400
   4401static int __init setup_slub_min_order(char *str)
   4402{
   4403	get_option(&str, (int *)&slub_min_order);
   4404
   4405	return 1;
   4406}
   4407
   4408__setup("slub_min_order=", setup_slub_min_order);
   4409
   4410static int __init setup_slub_max_order(char *str)
   4411{
   4412	get_option(&str, (int *)&slub_max_order);
   4413	slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
   4414
   4415	return 1;
   4416}
   4417
   4418__setup("slub_max_order=", setup_slub_max_order);
   4419
   4420static int __init setup_slub_min_objects(char *str)
   4421{
   4422	get_option(&str, (int *)&slub_min_objects);
   4423
   4424	return 1;
   4425}
   4426
   4427__setup("slub_min_objects=", setup_slub_min_objects);
   4428
   4429void *__kmalloc(size_t size, gfp_t flags)
   4430{
   4431	struct kmem_cache *s;
   4432	void *ret;
   4433
   4434	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
   4435		return kmalloc_large(size, flags);
   4436
   4437	s = kmalloc_slab(size, flags);
   4438
   4439	if (unlikely(ZERO_OR_NULL_PTR(s)))
   4440		return s;
   4441
   4442	ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
   4443
   4444	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
   4445
   4446	ret = kasan_kmalloc(s, ret, size, flags);
   4447
   4448	return ret;
   4449}
   4450EXPORT_SYMBOL(__kmalloc);
   4451
   4452#ifdef CONFIG_NUMA
   4453static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
   4454{
   4455	struct page *page;
   4456	void *ptr = NULL;
   4457	unsigned int order = get_order(size);
   4458
   4459	flags |= __GFP_COMP;
   4460	page = alloc_pages_node(node, flags, order);
   4461	if (page) {
   4462		ptr = page_address(page);
   4463		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
   4464				      PAGE_SIZE << order);
   4465	}
   4466
   4467	return kmalloc_large_node_hook(ptr, size, flags);
   4468}
   4469
   4470void *__kmalloc_node(size_t size, gfp_t flags, int node)
   4471{
   4472	struct kmem_cache *s;
   4473	void *ret;
   4474
   4475	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
   4476		ret = kmalloc_large_node(size, flags, node);
   4477
   4478		trace_kmalloc_node(_RET_IP_, ret,
   4479				   size, PAGE_SIZE << get_order(size),
   4480				   flags, node);
   4481
   4482		return ret;
   4483	}
   4484
   4485	s = kmalloc_slab(size, flags);
   4486
   4487	if (unlikely(ZERO_OR_NULL_PTR(s)))
   4488		return s;
   4489
   4490	ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
   4491
   4492	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
   4493
   4494	ret = kasan_kmalloc(s, ret, size, flags);
   4495
   4496	return ret;
   4497}
   4498EXPORT_SYMBOL(__kmalloc_node);
   4499#endif	/* CONFIG_NUMA */
   4500
   4501#ifdef CONFIG_HARDENED_USERCOPY
   4502/*
   4503 * Rejects incorrectly sized objects and objects that are to be copied
   4504 * to/from userspace but do not fall entirely within the containing slab
   4505 * cache's usercopy region.
   4506 *
   4507 * Returns NULL if check passes, otherwise const char * to name of cache
   4508 * to indicate an error.
   4509 */
   4510void __check_heap_object(const void *ptr, unsigned long n,
   4511			 const struct slab *slab, bool to_user)
   4512{
   4513	struct kmem_cache *s;
   4514	unsigned int offset;
   4515	bool is_kfence = is_kfence_address(ptr);
   4516
   4517	ptr = kasan_reset_tag(ptr);
   4518
   4519	/* Find object and usable object size. */
   4520	s = slab->slab_cache;
   4521
   4522	/* Reject impossible pointers. */
   4523	if (ptr < slab_address(slab))
   4524		usercopy_abort("SLUB object not in SLUB page?!", NULL,
   4525			       to_user, 0, n);
   4526
   4527	/* Find offset within object. */
   4528	if (is_kfence)
   4529		offset = ptr - kfence_object_start(ptr);
   4530	else
   4531		offset = (ptr - slab_address(slab)) % s->size;
   4532
   4533	/* Adjust for redzone and reject if within the redzone. */
   4534	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
   4535		if (offset < s->red_left_pad)
   4536			usercopy_abort("SLUB object in left red zone",
   4537				       s->name, to_user, offset, n);
   4538		offset -= s->red_left_pad;
   4539	}
   4540
   4541	/* Allow address range falling entirely within usercopy region. */
   4542	if (offset >= s->useroffset &&
   4543	    offset - s->useroffset <= s->usersize &&
   4544	    n <= s->useroffset - offset + s->usersize)
   4545		return;
   4546
   4547	usercopy_abort("SLUB object", s->name, to_user, offset, n);
   4548}
   4549#endif /* CONFIG_HARDENED_USERCOPY */
   4550
   4551size_t __ksize(const void *object)
   4552{
   4553	struct folio *folio;
   4554
   4555	if (unlikely(object == ZERO_SIZE_PTR))
   4556		return 0;
   4557
   4558	folio = virt_to_folio(object);
   4559
   4560	if (unlikely(!folio_test_slab(folio)))
   4561		return folio_size(folio);
   4562
   4563	return slab_ksize(folio_slab(folio)->slab_cache);
   4564}
   4565EXPORT_SYMBOL(__ksize);
   4566
   4567void kfree(const void *x)
   4568{
   4569	struct folio *folio;
   4570	struct slab *slab;
   4571	void *object = (void *)x;
   4572
   4573	trace_kfree(_RET_IP_, x);
   4574
   4575	if (unlikely(ZERO_OR_NULL_PTR(x)))
   4576		return;
   4577
   4578	folio = virt_to_folio(x);
   4579	if (unlikely(!folio_test_slab(folio))) {
   4580		free_large_kmalloc(folio, object);
   4581		return;
   4582	}
   4583	slab = folio_slab(folio);
   4584	slab_free(slab->slab_cache, slab, object, NULL, 1, _RET_IP_);
   4585}
   4586EXPORT_SYMBOL(kfree);
   4587
   4588#define SHRINK_PROMOTE_MAX 32
   4589
   4590/*
   4591 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
   4592 * up most to the head of the partial lists. New allocations will then
   4593 * fill those up and thus they can be removed from the partial lists.
   4594 *
   4595 * The slabs with the least items are placed last. This results in them
   4596 * being allocated from last increasing the chance that the last objects
   4597 * are freed in them.
   4598 */
   4599static int __kmem_cache_do_shrink(struct kmem_cache *s)
   4600{
   4601	int node;
   4602	int i;
   4603	struct kmem_cache_node *n;
   4604	struct slab *slab;
   4605	struct slab *t;
   4606	struct list_head discard;
   4607	struct list_head promote[SHRINK_PROMOTE_MAX];
   4608	unsigned long flags;
   4609	int ret = 0;
   4610
   4611	for_each_kmem_cache_node(s, node, n) {
   4612		INIT_LIST_HEAD(&discard);
   4613		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
   4614			INIT_LIST_HEAD(promote + i);
   4615
   4616		spin_lock_irqsave(&n->list_lock, flags);
   4617
   4618		/*
   4619		 * Build lists of slabs to discard or promote.
   4620		 *
   4621		 * Note that concurrent frees may occur while we hold the
   4622		 * list_lock. slab->inuse here is the upper limit.
   4623		 */
   4624		list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
   4625			int free = slab->objects - slab->inuse;
   4626
   4627			/* Do not reread slab->inuse */
   4628			barrier();
   4629
   4630			/* We do not keep full slabs on the list */
   4631			BUG_ON(free <= 0);
   4632
   4633			if (free == slab->objects) {
   4634				list_move(&slab->slab_list, &discard);
   4635				n->nr_partial--;
   4636			} else if (free <= SHRINK_PROMOTE_MAX)
   4637				list_move(&slab->slab_list, promote + free - 1);
   4638		}
   4639
   4640		/*
   4641		 * Promote the slabs filled up most to the head of the
   4642		 * partial list.
   4643		 */
   4644		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
   4645			list_splice(promote + i, &n->partial);
   4646
   4647		spin_unlock_irqrestore(&n->list_lock, flags);
   4648
   4649		/* Release empty slabs */
   4650		list_for_each_entry_safe(slab, t, &discard, slab_list)
   4651			discard_slab(s, slab);
   4652
   4653		if (slabs_node(s, node))
   4654			ret = 1;
   4655	}
   4656
   4657	return ret;
   4658}
   4659
   4660int __kmem_cache_shrink(struct kmem_cache *s)
   4661{
   4662	flush_all(s);
   4663	return __kmem_cache_do_shrink(s);
   4664}
   4665
   4666static int slab_mem_going_offline_callback(void *arg)
   4667{
   4668	struct kmem_cache *s;
   4669
   4670	mutex_lock(&slab_mutex);
   4671	list_for_each_entry(s, &slab_caches, list) {
   4672		flush_all_cpus_locked(s);
   4673		__kmem_cache_do_shrink(s);
   4674	}
   4675	mutex_unlock(&slab_mutex);
   4676
   4677	return 0;
   4678}
   4679
   4680static void slab_mem_offline_callback(void *arg)
   4681{
   4682	struct memory_notify *marg = arg;
   4683	int offline_node;
   4684
   4685	offline_node = marg->status_change_nid_normal;
   4686
   4687	/*
   4688	 * If the node still has available memory. we need kmem_cache_node
   4689	 * for it yet.
   4690	 */
   4691	if (offline_node < 0)
   4692		return;
   4693
   4694	mutex_lock(&slab_mutex);
   4695	node_clear(offline_node, slab_nodes);
   4696	/*
   4697	 * We no longer free kmem_cache_node structures here, as it would be
   4698	 * racy with all get_node() users, and infeasible to protect them with
   4699	 * slab_mutex.
   4700	 */
   4701	mutex_unlock(&slab_mutex);
   4702}
   4703
   4704static int slab_mem_going_online_callback(void *arg)
   4705{
   4706	struct kmem_cache_node *n;
   4707	struct kmem_cache *s;
   4708	struct memory_notify *marg = arg;
   4709	int nid = marg->status_change_nid_normal;
   4710	int ret = 0;
   4711
   4712	/*
   4713	 * If the node's memory is already available, then kmem_cache_node is
   4714	 * already created. Nothing to do.
   4715	 */
   4716	if (nid < 0)
   4717		return 0;
   4718
   4719	/*
   4720	 * We are bringing a node online. No memory is available yet. We must
   4721	 * allocate a kmem_cache_node structure in order to bring the node
   4722	 * online.
   4723	 */
   4724	mutex_lock(&slab_mutex);
   4725	list_for_each_entry(s, &slab_caches, list) {
   4726		/*
   4727		 * The structure may already exist if the node was previously
   4728		 * onlined and offlined.
   4729		 */
   4730		if (get_node(s, nid))
   4731			continue;
   4732		/*
   4733		 * XXX: kmem_cache_alloc_node will fallback to other nodes
   4734		 *      since memory is not yet available from the node that
   4735		 *      is brought up.
   4736		 */
   4737		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
   4738		if (!n) {
   4739			ret = -ENOMEM;
   4740			goto out;
   4741		}
   4742		init_kmem_cache_node(n);
   4743		s->node[nid] = n;
   4744	}
   4745	/*
   4746	 * Any cache created after this point will also have kmem_cache_node
   4747	 * initialized for the new node.
   4748	 */
   4749	node_set(nid, slab_nodes);
   4750out:
   4751	mutex_unlock(&slab_mutex);
   4752	return ret;
   4753}
   4754
   4755static int slab_memory_callback(struct notifier_block *self,
   4756				unsigned long action, void *arg)
   4757{
   4758	int ret = 0;
   4759
   4760	switch (action) {
   4761	case MEM_GOING_ONLINE:
   4762		ret = slab_mem_going_online_callback(arg);
   4763		break;
   4764	case MEM_GOING_OFFLINE:
   4765		ret = slab_mem_going_offline_callback(arg);
   4766		break;
   4767	case MEM_OFFLINE:
   4768	case MEM_CANCEL_ONLINE:
   4769		slab_mem_offline_callback(arg);
   4770		break;
   4771	case MEM_ONLINE:
   4772	case MEM_CANCEL_OFFLINE:
   4773		break;
   4774	}
   4775	if (ret)
   4776		ret = notifier_from_errno(ret);
   4777	else
   4778		ret = NOTIFY_OK;
   4779	return ret;
   4780}
   4781
   4782static struct notifier_block slab_memory_callback_nb = {
   4783	.notifier_call = slab_memory_callback,
   4784	.priority = SLAB_CALLBACK_PRI,
   4785};
   4786
   4787/********************************************************************
   4788 *			Basic setup of slabs
   4789 *******************************************************************/
   4790
   4791/*
   4792 * Used for early kmem_cache structures that were allocated using
   4793 * the page allocator. Allocate them properly then fix up the pointers
   4794 * that may be pointing to the wrong kmem_cache structure.
   4795 */
   4796
   4797static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
   4798{
   4799	int node;
   4800	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
   4801	struct kmem_cache_node *n;
   4802
   4803	memcpy(s, static_cache, kmem_cache->object_size);
   4804
   4805	/*
   4806	 * This runs very early, and only the boot processor is supposed to be
   4807	 * up.  Even if it weren't true, IRQs are not up so we couldn't fire
   4808	 * IPIs around.
   4809	 */
   4810	__flush_cpu_slab(s, smp_processor_id());
   4811	for_each_kmem_cache_node(s, node, n) {
   4812		struct slab *p;
   4813
   4814		list_for_each_entry(p, &n->partial, slab_list)
   4815			p->slab_cache = s;
   4816
   4817#ifdef CONFIG_SLUB_DEBUG
   4818		list_for_each_entry(p, &n->full, slab_list)
   4819			p->slab_cache = s;
   4820#endif
   4821	}
   4822	list_add(&s->list, &slab_caches);
   4823	return s;
   4824}
   4825
   4826void __init kmem_cache_init(void)
   4827{
   4828	static __initdata struct kmem_cache boot_kmem_cache,
   4829		boot_kmem_cache_node;
   4830	int node;
   4831
   4832	if (debug_guardpage_minorder())
   4833		slub_max_order = 0;
   4834
   4835	/* Print slub debugging pointers without hashing */
   4836	if (__slub_debug_enabled())
   4837		no_hash_pointers_enable(NULL);
   4838
   4839	kmem_cache_node = &boot_kmem_cache_node;
   4840	kmem_cache = &boot_kmem_cache;
   4841
   4842	/*
   4843	 * Initialize the nodemask for which we will allocate per node
   4844	 * structures. Here we don't need taking slab_mutex yet.
   4845	 */
   4846	for_each_node_state(node, N_NORMAL_MEMORY)
   4847		node_set(node, slab_nodes);
   4848
   4849	create_boot_cache(kmem_cache_node, "kmem_cache_node",
   4850		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
   4851
   4852	register_hotmemory_notifier(&slab_memory_callback_nb);
   4853
   4854	/* Able to allocate the per node structures */
   4855	slab_state = PARTIAL;
   4856
   4857	create_boot_cache(kmem_cache, "kmem_cache",
   4858			offsetof(struct kmem_cache, node) +
   4859				nr_node_ids * sizeof(struct kmem_cache_node *),
   4860		       SLAB_HWCACHE_ALIGN, 0, 0);
   4861
   4862	kmem_cache = bootstrap(&boot_kmem_cache);
   4863	kmem_cache_node = bootstrap(&boot_kmem_cache_node);
   4864
   4865	/* Now we can use the kmem_cache to allocate kmalloc slabs */
   4866	setup_kmalloc_cache_index_table();
   4867	create_kmalloc_caches(0);
   4868
   4869	/* Setup random freelists for each cache */
   4870	init_freelist_randomization();
   4871
   4872	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
   4873				  slub_cpu_dead);
   4874
   4875	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
   4876		cache_line_size(),
   4877		slub_min_order, slub_max_order, slub_min_objects,
   4878		nr_cpu_ids, nr_node_ids);
   4879}
   4880
   4881void __init kmem_cache_init_late(void)
   4882{
   4883}
   4884
   4885struct kmem_cache *
   4886__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
   4887		   slab_flags_t flags, void (*ctor)(void *))
   4888{
   4889	struct kmem_cache *s;
   4890
   4891	s = find_mergeable(size, align, flags, name, ctor);
   4892	if (s) {
   4893		s->refcount++;
   4894
   4895		/*
   4896		 * Adjust the object sizes so that we clear
   4897		 * the complete object on kzalloc.
   4898		 */
   4899		s->object_size = max(s->object_size, size);
   4900		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
   4901
   4902		if (sysfs_slab_alias(s, name)) {
   4903			s->refcount--;
   4904			s = NULL;
   4905		}
   4906	}
   4907
   4908	return s;
   4909}
   4910
   4911int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
   4912{
   4913	int err;
   4914
   4915	err = kmem_cache_open(s, flags);
   4916	if (err)
   4917		return err;
   4918
   4919	/* Mutex is not taken during early boot */
   4920	if (slab_state <= UP)
   4921		return 0;
   4922
   4923	err = sysfs_slab_add(s);
   4924	if (err) {
   4925		__kmem_cache_release(s);
   4926		return err;
   4927	}
   4928
   4929	if (s->flags & SLAB_STORE_USER)
   4930		debugfs_slab_add(s);
   4931
   4932	return 0;
   4933}
   4934
   4935void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
   4936{
   4937	struct kmem_cache *s;
   4938	void *ret;
   4939
   4940	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
   4941		return kmalloc_large(size, gfpflags);
   4942
   4943	s = kmalloc_slab(size, gfpflags);
   4944
   4945	if (unlikely(ZERO_OR_NULL_PTR(s)))
   4946		return s;
   4947
   4948	ret = slab_alloc(s, NULL, gfpflags, caller, size);
   4949
   4950	/* Honor the call site pointer we received. */
   4951	trace_kmalloc(caller, ret, size, s->size, gfpflags);
   4952
   4953	return ret;
   4954}
   4955EXPORT_SYMBOL(__kmalloc_track_caller);
   4956
   4957#ifdef CONFIG_NUMA
   4958void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
   4959					int node, unsigned long caller)
   4960{
   4961	struct kmem_cache *s;
   4962	void *ret;
   4963
   4964	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
   4965		ret = kmalloc_large_node(size, gfpflags, node);
   4966
   4967		trace_kmalloc_node(caller, ret,
   4968				   size, PAGE_SIZE << get_order(size),
   4969				   gfpflags, node);
   4970
   4971		return ret;
   4972	}
   4973
   4974	s = kmalloc_slab(size, gfpflags);
   4975
   4976	if (unlikely(ZERO_OR_NULL_PTR(s)))
   4977		return s;
   4978
   4979	ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
   4980
   4981	/* Honor the call site pointer we received. */
   4982	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
   4983
   4984	return ret;
   4985}
   4986EXPORT_SYMBOL(__kmalloc_node_track_caller);
   4987#endif
   4988
   4989#ifdef CONFIG_SYSFS
   4990static int count_inuse(struct slab *slab)
   4991{
   4992	return slab->inuse;
   4993}
   4994
   4995static int count_total(struct slab *slab)
   4996{
   4997	return slab->objects;
   4998}
   4999#endif
   5000
   5001#ifdef CONFIG_SLUB_DEBUG
   5002static void validate_slab(struct kmem_cache *s, struct slab *slab,
   5003			  unsigned long *obj_map)
   5004{
   5005	void *p;
   5006	void *addr = slab_address(slab);
   5007	unsigned long flags;
   5008
   5009	slab_lock(slab, &flags);
   5010
   5011	if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
   5012		goto unlock;
   5013
   5014	/* Now we know that a valid freelist exists */
   5015	__fill_map(obj_map, s, slab);
   5016	for_each_object(p, s, addr, slab->objects) {
   5017		u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
   5018			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
   5019
   5020		if (!check_object(s, slab, p, val))
   5021			break;
   5022	}
   5023unlock:
   5024	slab_unlock(slab, &flags);
   5025}
   5026
   5027static int validate_slab_node(struct kmem_cache *s,
   5028		struct kmem_cache_node *n, unsigned long *obj_map)
   5029{
   5030	unsigned long count = 0;
   5031	struct slab *slab;
   5032	unsigned long flags;
   5033
   5034	spin_lock_irqsave(&n->list_lock, flags);
   5035
   5036	list_for_each_entry(slab, &n->partial, slab_list) {
   5037		validate_slab(s, slab, obj_map);
   5038		count++;
   5039	}
   5040	if (count != n->nr_partial) {
   5041		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
   5042		       s->name, count, n->nr_partial);
   5043		slab_add_kunit_errors();
   5044	}
   5045
   5046	if (!(s->flags & SLAB_STORE_USER))
   5047		goto out;
   5048
   5049	list_for_each_entry(slab, &n->full, slab_list) {
   5050		validate_slab(s, slab, obj_map);
   5051		count++;
   5052	}
   5053	if (count != atomic_long_read(&n->nr_slabs)) {
   5054		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
   5055		       s->name, count, atomic_long_read(&n->nr_slabs));
   5056		slab_add_kunit_errors();
   5057	}
   5058
   5059out:
   5060	spin_unlock_irqrestore(&n->list_lock, flags);
   5061	return count;
   5062}
   5063
   5064long validate_slab_cache(struct kmem_cache *s)
   5065{
   5066	int node;
   5067	unsigned long count = 0;
   5068	struct kmem_cache_node *n;
   5069	unsigned long *obj_map;
   5070
   5071	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
   5072	if (!obj_map)
   5073		return -ENOMEM;
   5074
   5075	flush_all(s);
   5076	for_each_kmem_cache_node(s, node, n)
   5077		count += validate_slab_node(s, n, obj_map);
   5078
   5079	bitmap_free(obj_map);
   5080
   5081	return count;
   5082}
   5083EXPORT_SYMBOL(validate_slab_cache);
   5084
   5085#ifdef CONFIG_DEBUG_FS
   5086/*
   5087 * Generate lists of code addresses where slabcache objects are allocated
   5088 * and freed.
   5089 */
   5090
   5091struct location {
   5092	depot_stack_handle_t handle;
   5093	unsigned long count;
   5094	unsigned long addr;
   5095	long long sum_time;
   5096	long min_time;
   5097	long max_time;
   5098	long min_pid;
   5099	long max_pid;
   5100	DECLARE_BITMAP(cpus, NR_CPUS);
   5101	nodemask_t nodes;
   5102};
   5103
   5104struct loc_track {
   5105	unsigned long max;
   5106	unsigned long count;
   5107	struct location *loc;
   5108	loff_t idx;
   5109};
   5110
   5111static struct dentry *slab_debugfs_root;
   5112
   5113static void free_loc_track(struct loc_track *t)
   5114{
   5115	if (t->max)
   5116		free_pages((unsigned long)t->loc,
   5117			get_order(sizeof(struct location) * t->max));
   5118}
   5119
   5120static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
   5121{
   5122	struct location *l;
   5123	int order;
   5124
   5125	order = get_order(sizeof(struct location) * max);
   5126
   5127	l = (void *)__get_free_pages(flags, order);
   5128	if (!l)
   5129		return 0;
   5130
   5131	if (t->count) {
   5132		memcpy(l, t->loc, sizeof(struct location) * t->count);
   5133		free_loc_track(t);
   5134	}
   5135	t->max = max;
   5136	t->loc = l;
   5137	return 1;
   5138}
   5139
   5140static int add_location(struct loc_track *t, struct kmem_cache *s,
   5141				const struct track *track)
   5142{
   5143	long start, end, pos;
   5144	struct location *l;
   5145	unsigned long caddr, chandle;
   5146	unsigned long age = jiffies - track->when;
   5147	depot_stack_handle_t handle = 0;
   5148
   5149#ifdef CONFIG_STACKDEPOT
   5150	handle = READ_ONCE(track->handle);
   5151#endif
   5152	start = -1;
   5153	end = t->count;
   5154
   5155	for ( ; ; ) {
   5156		pos = start + (end - start + 1) / 2;
   5157
   5158		/*
   5159		 * There is nothing at "end". If we end up there
   5160		 * we need to add something to before end.
   5161		 */
   5162		if (pos == end)
   5163			break;
   5164
   5165		caddr = t->loc[pos].addr;
   5166		chandle = t->loc[pos].handle;
   5167		if ((track->addr == caddr) && (handle == chandle)) {
   5168
   5169			l = &t->loc[pos];
   5170			l->count++;
   5171			if (track->when) {
   5172				l->sum_time += age;
   5173				if (age < l->min_time)
   5174					l->min_time = age;
   5175				if (age > l->max_time)
   5176					l->max_time = age;
   5177
   5178				if (track->pid < l->min_pid)
   5179					l->min_pid = track->pid;
   5180				if (track->pid > l->max_pid)
   5181					l->max_pid = track->pid;
   5182
   5183				cpumask_set_cpu(track->cpu,
   5184						to_cpumask(l->cpus));
   5185			}
   5186			node_set(page_to_nid(virt_to_page(track)), l->nodes);
   5187			return 1;
   5188		}
   5189
   5190		if (track->addr < caddr)
   5191			end = pos;
   5192		else if (track->addr == caddr && handle < chandle)
   5193			end = pos;
   5194		else
   5195			start = pos;
   5196	}
   5197
   5198	/*
   5199	 * Not found. Insert new tracking element.
   5200	 */
   5201	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
   5202		return 0;
   5203
   5204	l = t->loc + pos;
   5205	if (pos < t->count)
   5206		memmove(l + 1, l,
   5207			(t->count - pos) * sizeof(struct location));
   5208	t->count++;
   5209	l->count = 1;
   5210	l->addr = track->addr;
   5211	l->sum_time = age;
   5212	l->min_time = age;
   5213	l->max_time = age;
   5214	l->min_pid = track->pid;
   5215	l->max_pid = track->pid;
   5216	l->handle = handle;
   5217	cpumask_clear(to_cpumask(l->cpus));
   5218	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
   5219	nodes_clear(l->nodes);
   5220	node_set(page_to_nid(virt_to_page(track)), l->nodes);
   5221	return 1;
   5222}
   5223
   5224static void process_slab(struct loc_track *t, struct kmem_cache *s,
   5225		struct slab *slab, enum track_item alloc,
   5226		unsigned long *obj_map)
   5227{
   5228	void *addr = slab_address(slab);
   5229	void *p;
   5230
   5231	__fill_map(obj_map, s, slab);
   5232
   5233	for_each_object(p, s, addr, slab->objects)
   5234		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
   5235			add_location(t, s, get_track(s, p, alloc));
   5236}
   5237#endif  /* CONFIG_DEBUG_FS   */
   5238#endif	/* CONFIG_SLUB_DEBUG */
   5239
   5240#ifdef CONFIG_SYSFS
   5241enum slab_stat_type {
   5242	SL_ALL,			/* All slabs */
   5243	SL_PARTIAL,		/* Only partially allocated slabs */
   5244	SL_CPU,			/* Only slabs used for cpu caches */
   5245	SL_OBJECTS,		/* Determine allocated objects not slabs */
   5246	SL_TOTAL		/* Determine object capacity not slabs */
   5247};
   5248
   5249#define SO_ALL		(1 << SL_ALL)
   5250#define SO_PARTIAL	(1 << SL_PARTIAL)
   5251#define SO_CPU		(1 << SL_CPU)
   5252#define SO_OBJECTS	(1 << SL_OBJECTS)
   5253#define SO_TOTAL	(1 << SL_TOTAL)
   5254
   5255static ssize_t show_slab_objects(struct kmem_cache *s,
   5256				 char *buf, unsigned long flags)
   5257{
   5258	unsigned long total = 0;
   5259	int node;
   5260	int x;
   5261	unsigned long *nodes;
   5262	int len = 0;
   5263
   5264	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
   5265	if (!nodes)
   5266		return -ENOMEM;
   5267
   5268	if (flags & SO_CPU) {
   5269		int cpu;
   5270
   5271		for_each_possible_cpu(cpu) {
   5272			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
   5273							       cpu);
   5274			int node;
   5275			struct slab *slab;
   5276
   5277			slab = READ_ONCE(c->slab);
   5278			if (!slab)
   5279				continue;
   5280
   5281			node = slab_nid(slab);
   5282			if (flags & SO_TOTAL)
   5283				x = slab->objects;
   5284			else if (flags & SO_OBJECTS)
   5285				x = slab->inuse;
   5286			else
   5287				x = 1;
   5288
   5289			total += x;
   5290			nodes[node] += x;
   5291
   5292#ifdef CONFIG_SLUB_CPU_PARTIAL
   5293			slab = slub_percpu_partial_read_once(c);
   5294			if (slab) {
   5295				node = slab_nid(slab);
   5296				if (flags & SO_TOTAL)
   5297					WARN_ON_ONCE(1);
   5298				else if (flags & SO_OBJECTS)
   5299					WARN_ON_ONCE(1);
   5300				else
   5301					x = slab->slabs;
   5302				total += x;
   5303				nodes[node] += x;
   5304			}
   5305#endif
   5306		}
   5307	}
   5308
   5309	/*
   5310	 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
   5311	 * already held which will conflict with an existing lock order:
   5312	 *
   5313	 * mem_hotplug_lock->slab_mutex->kernfs_mutex
   5314	 *
   5315	 * We don't really need mem_hotplug_lock (to hold off
   5316	 * slab_mem_going_offline_callback) here because slab's memory hot
   5317	 * unplug code doesn't destroy the kmem_cache->node[] data.
   5318	 */
   5319
   5320#ifdef CONFIG_SLUB_DEBUG
   5321	if (flags & SO_ALL) {
   5322		struct kmem_cache_node *n;
   5323
   5324		for_each_kmem_cache_node(s, node, n) {
   5325
   5326			if (flags & SO_TOTAL)
   5327				x = atomic_long_read(&n->total_objects);
   5328			else if (flags & SO_OBJECTS)
   5329				x = atomic_long_read(&n->total_objects) -
   5330					count_partial(n, count_free);
   5331			else
   5332				x = atomic_long_read(&n->nr_slabs);
   5333			total += x;
   5334			nodes[node] += x;
   5335		}
   5336
   5337	} else
   5338#endif
   5339	if (flags & SO_PARTIAL) {
   5340		struct kmem_cache_node *n;
   5341
   5342		for_each_kmem_cache_node(s, node, n) {
   5343			if (flags & SO_TOTAL)
   5344				x = count_partial(n, count_total);
   5345			else if (flags & SO_OBJECTS)
   5346				x = count_partial(n, count_inuse);
   5347			else
   5348				x = n->nr_partial;
   5349			total += x;
   5350			nodes[node] += x;
   5351		}
   5352	}
   5353
   5354	len += sysfs_emit_at(buf, len, "%lu", total);
   5355#ifdef CONFIG_NUMA
   5356	for (node = 0; node < nr_node_ids; node++) {
   5357		if (nodes[node])
   5358			len += sysfs_emit_at(buf, len, " N%d=%lu",
   5359					     node, nodes[node]);
   5360	}
   5361#endif
   5362	len += sysfs_emit_at(buf, len, "\n");
   5363	kfree(nodes);
   5364
   5365	return len;
   5366}
   5367
   5368#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
   5369#define to_slab(n) container_of(n, struct kmem_cache, kobj)
   5370
   5371struct slab_attribute {
   5372	struct attribute attr;
   5373	ssize_t (*show)(struct kmem_cache *s, char *buf);
   5374	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
   5375};
   5376
   5377#define SLAB_ATTR_RO(_name) \
   5378	static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
   5379
   5380#define SLAB_ATTR(_name) \
   5381	static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
   5382
   5383static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
   5384{
   5385	return sysfs_emit(buf, "%u\n", s->size);
   5386}
   5387SLAB_ATTR_RO(slab_size);
   5388
   5389static ssize_t align_show(struct kmem_cache *s, char *buf)
   5390{
   5391	return sysfs_emit(buf, "%u\n", s->align);
   5392}
   5393SLAB_ATTR_RO(align);
   5394
   5395static ssize_t object_size_show(struct kmem_cache *s, char *buf)
   5396{
   5397	return sysfs_emit(buf, "%u\n", s->object_size);
   5398}
   5399SLAB_ATTR_RO(object_size);
   5400
   5401static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
   5402{
   5403	return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
   5404}
   5405SLAB_ATTR_RO(objs_per_slab);
   5406
   5407static ssize_t order_show(struct kmem_cache *s, char *buf)
   5408{
   5409	return sysfs_emit(buf, "%u\n", oo_order(s->oo));
   5410}
   5411SLAB_ATTR_RO(order);
   5412
   5413static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
   5414{
   5415	return sysfs_emit(buf, "%lu\n", s->min_partial);
   5416}
   5417
   5418static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
   5419				 size_t length)
   5420{
   5421	unsigned long min;
   5422	int err;
   5423
   5424	err = kstrtoul(buf, 10, &min);
   5425	if (err)
   5426		return err;
   5427
   5428	s->min_partial = min;
   5429	return length;
   5430}
   5431SLAB_ATTR(min_partial);
   5432
   5433static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
   5434{
   5435	unsigned int nr_partial = 0;
   5436#ifdef CONFIG_SLUB_CPU_PARTIAL
   5437	nr_partial = s->cpu_partial;
   5438#endif
   5439
   5440	return sysfs_emit(buf, "%u\n", nr_partial);
   5441}
   5442
   5443static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
   5444				 size_t length)
   5445{
   5446	unsigned int objects;
   5447	int err;
   5448
   5449	err = kstrtouint(buf, 10, &objects);
   5450	if (err)
   5451		return err;
   5452	if (objects && !kmem_cache_has_cpu_partial(s))
   5453		return -EINVAL;
   5454
   5455	slub_set_cpu_partial(s, objects);
   5456	flush_all(s);
   5457	return length;
   5458}
   5459SLAB_ATTR(cpu_partial);
   5460
   5461static ssize_t ctor_show(struct kmem_cache *s, char *buf)
   5462{
   5463	if (!s->ctor)
   5464		return 0;
   5465	return sysfs_emit(buf, "%pS\n", s->ctor);
   5466}
   5467SLAB_ATTR_RO(ctor);
   5468
   5469static ssize_t aliases_show(struct kmem_cache *s, char *buf)
   5470{
   5471	return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
   5472}
   5473SLAB_ATTR_RO(aliases);
   5474
   5475static ssize_t partial_show(struct kmem_cache *s, char *buf)
   5476{
   5477	return show_slab_objects(s, buf, SO_PARTIAL);
   5478}
   5479SLAB_ATTR_RO(partial);
   5480
   5481static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
   5482{
   5483	return show_slab_objects(s, buf, SO_CPU);
   5484}
   5485SLAB_ATTR_RO(cpu_slabs);
   5486
   5487static ssize_t objects_show(struct kmem_cache *s, char *buf)
   5488{
   5489	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
   5490}
   5491SLAB_ATTR_RO(objects);
   5492
   5493static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
   5494{
   5495	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
   5496}
   5497SLAB_ATTR_RO(objects_partial);
   5498
   5499static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
   5500{
   5501	int objects = 0;
   5502	int slabs = 0;
   5503	int cpu __maybe_unused;
   5504	int len = 0;
   5505
   5506#ifdef CONFIG_SLUB_CPU_PARTIAL
   5507	for_each_online_cpu(cpu) {
   5508		struct slab *slab;
   5509
   5510		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
   5511
   5512		if (slab)
   5513			slabs += slab->slabs;
   5514	}
   5515#endif
   5516
   5517	/* Approximate half-full slabs, see slub_set_cpu_partial() */
   5518	objects = (slabs * oo_objects(s->oo)) / 2;
   5519	len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
   5520
   5521#if defined(CONFIG_SLUB_CPU_PARTIAL) && defined(CONFIG_SMP)
   5522	for_each_online_cpu(cpu) {
   5523		struct slab *slab;
   5524
   5525		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
   5526		if (slab) {
   5527			slabs = READ_ONCE(slab->slabs);
   5528			objects = (slabs * oo_objects(s->oo)) / 2;
   5529			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
   5530					     cpu, objects, slabs);
   5531		}
   5532	}
   5533#endif
   5534	len += sysfs_emit_at(buf, len, "\n");
   5535
   5536	return len;
   5537}
   5538SLAB_ATTR_RO(slabs_cpu_partial);
   5539
   5540static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
   5541{
   5542	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
   5543}
   5544SLAB_ATTR_RO(reclaim_account);
   5545
   5546static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
   5547{
   5548	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
   5549}
   5550SLAB_ATTR_RO(hwcache_align);
   5551
   5552#ifdef CONFIG_ZONE_DMA
   5553static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
   5554{
   5555	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
   5556}
   5557SLAB_ATTR_RO(cache_dma);
   5558#endif
   5559
   5560static ssize_t usersize_show(struct kmem_cache *s, char *buf)
   5561{
   5562	return sysfs_emit(buf, "%u\n", s->usersize);
   5563}
   5564SLAB_ATTR_RO(usersize);
   5565
   5566static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
   5567{
   5568	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
   5569}
   5570SLAB_ATTR_RO(destroy_by_rcu);
   5571
   5572#ifdef CONFIG_SLUB_DEBUG
   5573static ssize_t slabs_show(struct kmem_cache *s, char *buf)
   5574{
   5575	return show_slab_objects(s, buf, SO_ALL);
   5576}
   5577SLAB_ATTR_RO(slabs);
   5578
   5579static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
   5580{
   5581	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
   5582}
   5583SLAB_ATTR_RO(total_objects);
   5584
   5585static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
   5586{
   5587	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
   5588}
   5589SLAB_ATTR_RO(sanity_checks);
   5590
   5591static ssize_t trace_show(struct kmem_cache *s, char *buf)
   5592{
   5593	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
   5594}
   5595SLAB_ATTR_RO(trace);
   5596
   5597static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
   5598{
   5599	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
   5600}
   5601
   5602SLAB_ATTR_RO(red_zone);
   5603
   5604static ssize_t poison_show(struct kmem_cache *s, char *buf)
   5605{
   5606	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
   5607}
   5608
   5609SLAB_ATTR_RO(poison);
   5610
   5611static ssize_t store_user_show(struct kmem_cache *s, char *buf)
   5612{
   5613	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
   5614}
   5615
   5616SLAB_ATTR_RO(store_user);
   5617
   5618static ssize_t validate_show(struct kmem_cache *s, char *buf)
   5619{
   5620	return 0;
   5621}
   5622
   5623static ssize_t validate_store(struct kmem_cache *s,
   5624			const char *buf, size_t length)
   5625{
   5626	int ret = -EINVAL;
   5627
   5628	if (buf[0] == '1') {
   5629		ret = validate_slab_cache(s);
   5630		if (ret >= 0)
   5631			ret = length;
   5632	}
   5633	return ret;
   5634}
   5635SLAB_ATTR(validate);
   5636
   5637#endif /* CONFIG_SLUB_DEBUG */
   5638
   5639#ifdef CONFIG_FAILSLAB
   5640static ssize_t failslab_show(struct kmem_cache *s, char *buf)
   5641{
   5642	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
   5643}
   5644SLAB_ATTR_RO(failslab);
   5645#endif
   5646
   5647static ssize_t shrink_show(struct kmem_cache *s, char *buf)
   5648{
   5649	return 0;
   5650}
   5651
   5652static ssize_t shrink_store(struct kmem_cache *s,
   5653			const char *buf, size_t length)
   5654{
   5655	if (buf[0] == '1')
   5656		kmem_cache_shrink(s);
   5657	else
   5658		return -EINVAL;
   5659	return length;
   5660}
   5661SLAB_ATTR(shrink);
   5662
   5663#ifdef CONFIG_NUMA
   5664static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
   5665{
   5666	return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
   5667}
   5668
   5669static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
   5670				const char *buf, size_t length)
   5671{
   5672	unsigned int ratio;
   5673	int err;
   5674
   5675	err = kstrtouint(buf, 10, &ratio);
   5676	if (err)
   5677		return err;
   5678	if (ratio > 100)
   5679		return -ERANGE;
   5680
   5681	s->remote_node_defrag_ratio = ratio * 10;
   5682
   5683	return length;
   5684}
   5685SLAB_ATTR(remote_node_defrag_ratio);
   5686#endif
   5687
   5688#ifdef CONFIG_SLUB_STATS
   5689static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
   5690{
   5691	unsigned long sum  = 0;
   5692	int cpu;
   5693	int len = 0;
   5694	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
   5695
   5696	if (!data)
   5697		return -ENOMEM;
   5698
   5699	for_each_online_cpu(cpu) {
   5700		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
   5701
   5702		data[cpu] = x;
   5703		sum += x;
   5704	}
   5705
   5706	len += sysfs_emit_at(buf, len, "%lu", sum);
   5707
   5708#ifdef CONFIG_SMP
   5709	for_each_online_cpu(cpu) {
   5710		if (data[cpu])
   5711			len += sysfs_emit_at(buf, len, " C%d=%u",
   5712					     cpu, data[cpu]);
   5713	}
   5714#endif
   5715	kfree(data);
   5716	len += sysfs_emit_at(buf, len, "\n");
   5717
   5718	return len;
   5719}
   5720
   5721static void clear_stat(struct kmem_cache *s, enum stat_item si)
   5722{
   5723	int cpu;
   5724
   5725	for_each_online_cpu(cpu)
   5726		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
   5727}
   5728
   5729#define STAT_ATTR(si, text) 					\
   5730static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
   5731{								\
   5732	return show_stat(s, buf, si);				\
   5733}								\
   5734static ssize_t text##_store(struct kmem_cache *s,		\
   5735				const char *buf, size_t length)	\
   5736{								\
   5737	if (buf[0] != '0')					\
   5738		return -EINVAL;					\
   5739	clear_stat(s, si);					\
   5740	return length;						\
   5741}								\
   5742SLAB_ATTR(text);						\
   5743
   5744STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
   5745STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
   5746STAT_ATTR(FREE_FASTPATH, free_fastpath);
   5747STAT_ATTR(FREE_SLOWPATH, free_slowpath);
   5748STAT_ATTR(FREE_FROZEN, free_frozen);
   5749STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
   5750STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
   5751STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
   5752STAT_ATTR(ALLOC_SLAB, alloc_slab);
   5753STAT_ATTR(ALLOC_REFILL, alloc_refill);
   5754STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
   5755STAT_ATTR(FREE_SLAB, free_slab);
   5756STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
   5757STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
   5758STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
   5759STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
   5760STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
   5761STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
   5762STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
   5763STAT_ATTR(ORDER_FALLBACK, order_fallback);
   5764STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
   5765STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
   5766STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
   5767STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
   5768STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
   5769STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
   5770#endif	/* CONFIG_SLUB_STATS */
   5771
   5772static struct attribute *slab_attrs[] = {
   5773	&slab_size_attr.attr,
   5774	&object_size_attr.attr,
   5775	&objs_per_slab_attr.attr,
   5776	&order_attr.attr,
   5777	&min_partial_attr.attr,
   5778	&cpu_partial_attr.attr,
   5779	&objects_attr.attr,
   5780	&objects_partial_attr.attr,
   5781	&partial_attr.attr,
   5782	&cpu_slabs_attr.attr,
   5783	&ctor_attr.attr,
   5784	&aliases_attr.attr,
   5785	&align_attr.attr,
   5786	&hwcache_align_attr.attr,
   5787	&reclaim_account_attr.attr,
   5788	&destroy_by_rcu_attr.attr,
   5789	&shrink_attr.attr,
   5790	&slabs_cpu_partial_attr.attr,
   5791#ifdef CONFIG_SLUB_DEBUG
   5792	&total_objects_attr.attr,
   5793	&slabs_attr.attr,
   5794	&sanity_checks_attr.attr,
   5795	&trace_attr.attr,
   5796	&red_zone_attr.attr,
   5797	&poison_attr.attr,
   5798	&store_user_attr.attr,
   5799	&validate_attr.attr,
   5800#endif
   5801#ifdef CONFIG_ZONE_DMA
   5802	&cache_dma_attr.attr,
   5803#endif
   5804#ifdef CONFIG_NUMA
   5805	&remote_node_defrag_ratio_attr.attr,
   5806#endif
   5807#ifdef CONFIG_SLUB_STATS
   5808	&alloc_fastpath_attr.attr,
   5809	&alloc_slowpath_attr.attr,
   5810	&free_fastpath_attr.attr,
   5811	&free_slowpath_attr.attr,
   5812	&free_frozen_attr.attr,
   5813	&free_add_partial_attr.attr,
   5814	&free_remove_partial_attr.attr,
   5815	&alloc_from_partial_attr.attr,
   5816	&alloc_slab_attr.attr,
   5817	&alloc_refill_attr.attr,
   5818	&alloc_node_mismatch_attr.attr,
   5819	&free_slab_attr.attr,
   5820	&cpuslab_flush_attr.attr,
   5821	&deactivate_full_attr.attr,
   5822	&deactivate_empty_attr.attr,
   5823	&deactivate_to_head_attr.attr,
   5824	&deactivate_to_tail_attr.attr,
   5825	&deactivate_remote_frees_attr.attr,
   5826	&deactivate_bypass_attr.attr,
   5827	&order_fallback_attr.attr,
   5828	&cmpxchg_double_fail_attr.attr,
   5829	&cmpxchg_double_cpu_fail_attr.attr,
   5830	&cpu_partial_alloc_attr.attr,
   5831	&cpu_partial_free_attr.attr,
   5832	&cpu_partial_node_attr.attr,
   5833	&cpu_partial_drain_attr.attr,
   5834#endif
   5835#ifdef CONFIG_FAILSLAB
   5836	&failslab_attr.attr,
   5837#endif
   5838	&usersize_attr.attr,
   5839
   5840	NULL
   5841};
   5842
   5843static const struct attribute_group slab_attr_group = {
   5844	.attrs = slab_attrs,
   5845};
   5846
   5847static ssize_t slab_attr_show(struct kobject *kobj,
   5848				struct attribute *attr,
   5849				char *buf)
   5850{
   5851	struct slab_attribute *attribute;
   5852	struct kmem_cache *s;
   5853	int err;
   5854
   5855	attribute = to_slab_attr(attr);
   5856	s = to_slab(kobj);
   5857
   5858	if (!attribute->show)
   5859		return -EIO;
   5860
   5861	err = attribute->show(s, buf);
   5862
   5863	return err;
   5864}
   5865
   5866static ssize_t slab_attr_store(struct kobject *kobj,
   5867				struct attribute *attr,
   5868				const char *buf, size_t len)
   5869{
   5870	struct slab_attribute *attribute;
   5871	struct kmem_cache *s;
   5872	int err;
   5873
   5874	attribute = to_slab_attr(attr);
   5875	s = to_slab(kobj);
   5876
   5877	if (!attribute->store)
   5878		return -EIO;
   5879
   5880	err = attribute->store(s, buf, len);
   5881	return err;
   5882}
   5883
   5884static void kmem_cache_release(struct kobject *k)
   5885{
   5886	slab_kmem_cache_release(to_slab(k));
   5887}
   5888
   5889static const struct sysfs_ops slab_sysfs_ops = {
   5890	.show = slab_attr_show,
   5891	.store = slab_attr_store,
   5892};
   5893
   5894static struct kobj_type slab_ktype = {
   5895	.sysfs_ops = &slab_sysfs_ops,
   5896	.release = kmem_cache_release,
   5897};
   5898
   5899static struct kset *slab_kset;
   5900
   5901static inline struct kset *cache_kset(struct kmem_cache *s)
   5902{
   5903	return slab_kset;
   5904}
   5905
   5906#define ID_STR_LENGTH 64
   5907
   5908/* Create a unique string id for a slab cache:
   5909 *
   5910 * Format	:[flags-]size
   5911 */
   5912static char *create_unique_id(struct kmem_cache *s)
   5913{
   5914	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
   5915	char *p = name;
   5916
   5917	BUG_ON(!name);
   5918
   5919	*p++ = ':';
   5920	/*
   5921	 * First flags affecting slabcache operations. We will only
   5922	 * get here for aliasable slabs so we do not need to support
   5923	 * too many flags. The flags here must cover all flags that
   5924	 * are matched during merging to guarantee that the id is
   5925	 * unique.
   5926	 */
   5927	if (s->flags & SLAB_CACHE_DMA)
   5928		*p++ = 'd';
   5929	if (s->flags & SLAB_CACHE_DMA32)
   5930		*p++ = 'D';
   5931	if (s->flags & SLAB_RECLAIM_ACCOUNT)
   5932		*p++ = 'a';
   5933	if (s->flags & SLAB_CONSISTENCY_CHECKS)
   5934		*p++ = 'F';
   5935	if (s->flags & SLAB_ACCOUNT)
   5936		*p++ = 'A';
   5937	if (p != name + 1)
   5938		*p++ = '-';
   5939	p += sprintf(p, "%07u", s->size);
   5940
   5941	BUG_ON(p > name + ID_STR_LENGTH - 1);
   5942	return name;
   5943}
   5944
   5945static int sysfs_slab_add(struct kmem_cache *s)
   5946{
   5947	int err;
   5948	const char *name;
   5949	struct kset *kset = cache_kset(s);
   5950	int unmergeable = slab_unmergeable(s);
   5951
   5952	if (!kset) {
   5953		kobject_init(&s->kobj, &slab_ktype);
   5954		return 0;
   5955	}
   5956
   5957	if (!unmergeable && disable_higher_order_debug &&
   5958			(slub_debug & DEBUG_METADATA_FLAGS))
   5959		unmergeable = 1;
   5960
   5961	if (unmergeable) {
   5962		/*
   5963		 * Slabcache can never be merged so we can use the name proper.
   5964		 * This is typically the case for debug situations. In that
   5965		 * case we can catch duplicate names easily.
   5966		 */
   5967		sysfs_remove_link(&slab_kset->kobj, s->name);
   5968		name = s->name;
   5969	} else {
   5970		/*
   5971		 * Create a unique name for the slab as a target
   5972		 * for the symlinks.
   5973		 */
   5974		name = create_unique_id(s);
   5975	}
   5976
   5977	s->kobj.kset = kset;
   5978	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
   5979	if (err)
   5980		goto out;
   5981
   5982	err = sysfs_create_group(&s->kobj, &slab_attr_group);
   5983	if (err)
   5984		goto out_del_kobj;
   5985
   5986	if (!unmergeable) {
   5987		/* Setup first alias */
   5988		sysfs_slab_alias(s, s->name);
   5989	}
   5990out:
   5991	if (!unmergeable)
   5992		kfree(name);
   5993	return err;
   5994out_del_kobj:
   5995	kobject_del(&s->kobj);
   5996	goto out;
   5997}
   5998
   5999void sysfs_slab_unlink(struct kmem_cache *s)
   6000{
   6001	if (slab_state >= FULL)
   6002		kobject_del(&s->kobj);
   6003}
   6004
   6005void sysfs_slab_release(struct kmem_cache *s)
   6006{
   6007	if (slab_state >= FULL)
   6008		kobject_put(&s->kobj);
   6009}
   6010
   6011/*
   6012 * Need to buffer aliases during bootup until sysfs becomes
   6013 * available lest we lose that information.
   6014 */
   6015struct saved_alias {
   6016	struct kmem_cache *s;
   6017	const char *name;
   6018	struct saved_alias *next;
   6019};
   6020
   6021static struct saved_alias *alias_list;
   6022
   6023static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
   6024{
   6025	struct saved_alias *al;
   6026
   6027	if (slab_state == FULL) {
   6028		/*
   6029		 * If we have a leftover link then remove it.
   6030		 */
   6031		sysfs_remove_link(&slab_kset->kobj, name);
   6032		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
   6033	}
   6034
   6035	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
   6036	if (!al)
   6037		return -ENOMEM;
   6038
   6039	al->s = s;
   6040	al->name = name;
   6041	al->next = alias_list;
   6042	alias_list = al;
   6043	return 0;
   6044}
   6045
   6046static int __init slab_sysfs_init(void)
   6047{
   6048	struct kmem_cache *s;
   6049	int err;
   6050
   6051	mutex_lock(&slab_mutex);
   6052
   6053	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
   6054	if (!slab_kset) {
   6055		mutex_unlock(&slab_mutex);
   6056		pr_err("Cannot register slab subsystem.\n");
   6057		return -ENOSYS;
   6058	}
   6059
   6060	slab_state = FULL;
   6061
   6062	list_for_each_entry(s, &slab_caches, list) {
   6063		err = sysfs_slab_add(s);
   6064		if (err)
   6065			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
   6066			       s->name);
   6067	}
   6068
   6069	while (alias_list) {
   6070		struct saved_alias *al = alias_list;
   6071
   6072		alias_list = alias_list->next;
   6073		err = sysfs_slab_alias(al->s, al->name);
   6074		if (err)
   6075			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
   6076			       al->name);
   6077		kfree(al);
   6078	}
   6079
   6080	mutex_unlock(&slab_mutex);
   6081	return 0;
   6082}
   6083
   6084__initcall(slab_sysfs_init);
   6085#endif /* CONFIG_SYSFS */
   6086
   6087#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
   6088static int slab_debugfs_show(struct seq_file *seq, void *v)
   6089{
   6090	struct loc_track *t = seq->private;
   6091	struct location *l;
   6092	unsigned long idx;
   6093
   6094	idx = (unsigned long) t->idx;
   6095	if (idx < t->count) {
   6096		l = &t->loc[idx];
   6097
   6098		seq_printf(seq, "%7ld ", l->count);
   6099
   6100		if (l->addr)
   6101			seq_printf(seq, "%pS", (void *)l->addr);
   6102		else
   6103			seq_puts(seq, "<not-available>");
   6104
   6105		if (l->sum_time != l->min_time) {
   6106			seq_printf(seq, " age=%ld/%llu/%ld",
   6107				l->min_time, div_u64(l->sum_time, l->count),
   6108				l->max_time);
   6109		} else
   6110			seq_printf(seq, " age=%ld", l->min_time);
   6111
   6112		if (l->min_pid != l->max_pid)
   6113			seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
   6114		else
   6115			seq_printf(seq, " pid=%ld",
   6116				l->min_pid);
   6117
   6118		if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
   6119			seq_printf(seq, " cpus=%*pbl",
   6120				 cpumask_pr_args(to_cpumask(l->cpus)));
   6121
   6122		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
   6123			seq_printf(seq, " nodes=%*pbl",
   6124				 nodemask_pr_args(&l->nodes));
   6125
   6126#ifdef CONFIG_STACKDEPOT
   6127		{
   6128			depot_stack_handle_t handle;
   6129			unsigned long *entries;
   6130			unsigned int nr_entries, j;
   6131
   6132			handle = READ_ONCE(l->handle);
   6133			if (handle) {
   6134				nr_entries = stack_depot_fetch(handle, &entries);
   6135				seq_puts(seq, "\n");
   6136				for (j = 0; j < nr_entries; j++)
   6137					seq_printf(seq, "        %pS\n", (void *)entries[j]);
   6138			}
   6139		}
   6140#endif
   6141		seq_puts(seq, "\n");
   6142	}
   6143
   6144	if (!idx && !t->count)
   6145		seq_puts(seq, "No data\n");
   6146
   6147	return 0;
   6148}
   6149
/*
 * seq_file ->stop(): intentionally empty; slab_debugfs_start() acquires
 * nothing that would have to be released here.
 */
static void slab_debugfs_stop(struct seq_file *seq, void *v)
{
}
   6153
   6154static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
   6155{
   6156	struct loc_track *t = seq->private;
   6157
   6158	t->idx = ++(*ppos);
   6159	if (*ppos <= t->count)
   6160		return ppos;
   6161
   6162	return NULL;
   6163}
   6164
   6165static int cmp_loc_by_count(const void *a, const void *b, const void *data)
   6166{
   6167	struct location *loc1 = (struct location *)a;
   6168	struct location *loc2 = (struct location *)b;
   6169
   6170	if (loc1->count > loc2->count)
   6171		return -1;
   6172	else
   6173		return 1;
   6174}
   6175
   6176static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
   6177{
   6178	struct loc_track *t = seq->private;
   6179
   6180	t->idx = *ppos;
   6181	return ppos;
   6182}
   6183
/* seq_file iterator passed to __seq_open_private() by slab_debug_trace_open(). */
static const struct seq_operations slab_debugfs_sops = {
	.start  = slab_debugfs_start,
	.next   = slab_debugfs_next,
	.stop   = slab_debugfs_stop,
	.show   = slab_debugfs_show,
};
   6190
   6191static int slab_debug_trace_open(struct inode *inode, struct file *filep)
   6192{
   6193
   6194	struct kmem_cache_node *n;
   6195	enum track_item alloc;
   6196	int node;
   6197	struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
   6198						sizeof(struct loc_track));
   6199	struct kmem_cache *s = file_inode(filep)->i_private;
   6200	unsigned long *obj_map;
   6201
   6202	if (!t)
   6203		return -ENOMEM;
   6204
   6205	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
   6206	if (!obj_map) {
   6207		seq_release_private(inode, filep);
   6208		return -ENOMEM;
   6209	}
   6210
   6211	if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
   6212		alloc = TRACK_ALLOC;
   6213	else
   6214		alloc = TRACK_FREE;
   6215
   6216	if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
   6217		bitmap_free(obj_map);
   6218		seq_release_private(inode, filep);
   6219		return -ENOMEM;
   6220	}
   6221
   6222	for_each_kmem_cache_node(s, node, n) {
   6223		unsigned long flags;
   6224		struct slab *slab;
   6225
   6226		if (!atomic_long_read(&n->nr_slabs))
   6227			continue;
   6228
   6229		spin_lock_irqsave(&n->list_lock, flags);
   6230		list_for_each_entry(slab, &n->partial, slab_list)
   6231			process_slab(t, s, slab, alloc, obj_map);
   6232		list_for_each_entry(slab, &n->full, slab_list)
   6233			process_slab(t, s, slab, alloc, obj_map);
   6234		spin_unlock_irqrestore(&n->list_lock, flags);
   6235	}
   6236
   6237	/* Sort locations by count */
   6238	sort_r(t->loc, t->count, sizeof(struct location),
   6239		cmp_loc_by_count, NULL, NULL);
   6240
   6241	bitmap_free(obj_map);
   6242	return 0;
   6243}
   6244
   6245static int slab_debug_trace_release(struct inode *inode, struct file *file)
   6246{
   6247	struct seq_file *seq = file->private_data;
   6248	struct loc_track *t = seq->private;
   6249
   6250	free_loc_track(t);
   6251	return seq_release_private(inode, file);
   6252}
   6253
/* File operations shared by the "alloc_traces" and "free_traces" files. */
static const struct file_operations slab_debugfs_fops = {
	.open    = slab_debug_trace_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = slab_debug_trace_release,
};
   6260
   6261static void debugfs_slab_add(struct kmem_cache *s)
   6262{
   6263	struct dentry *slab_cache_dir;
   6264
   6265	if (unlikely(!slab_debugfs_root))
   6266		return;
   6267
   6268	slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
   6269
   6270	debugfs_create_file("alloc_traces", 0400,
   6271		slab_cache_dir, s, &slab_debugfs_fops);
   6272
   6273	debugfs_create_file("free_traces", 0400,
   6274		slab_cache_dir, s, &slab_debugfs_fops);
   6275}
   6276
   6277void debugfs_slab_release(struct kmem_cache *s)
   6278{
   6279	debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
   6280}
   6281
   6282static int __init slab_debugfs_init(void)
   6283{
   6284	struct kmem_cache *s;
   6285
   6286	slab_debugfs_root = debugfs_create_dir("slab", NULL);
   6287
   6288	list_for_each_entry(s, &slab_caches, list)
   6289		if (s->flags & SLAB_STORE_USER)
   6290			debugfs_slab_add(s);
   6291
   6292	return 0;
   6293
   6294}
   6295__initcall(slab_debugfs_init);
   6296#endif
   6297/*
   6298 * The /proc/slabinfo ABI
   6299 */
   6300#ifdef CONFIG_SLUB_DEBUG
   6301void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
   6302{
   6303	unsigned long nr_slabs = 0;
   6304	unsigned long nr_objs = 0;
   6305	unsigned long nr_free = 0;
   6306	int node;
   6307	struct kmem_cache_node *n;
   6308
   6309	for_each_kmem_cache_node(s, node, n) {
   6310		nr_slabs += node_nr_slabs(n);
   6311		nr_objs += node_nr_objs(n);
   6312		nr_free += count_partial(n, count_free);
   6313	}
   6314
   6315	sinfo->active_objs = nr_objs - nr_free;
   6316	sinfo->num_objs = nr_objs;
   6317	sinfo->active_slabs = nr_slabs;
   6318	sinfo->num_slabs = nr_slabs;
   6319	sinfo->objects_per_slab = oo_objects(s->oo);
   6320	sinfo->cache_order = oo_order(s->oo);
   6321}
   6322
/* Intentionally empty: this implementation writes no extra statistics to @m. */
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
}
   6326
/* Write handler for slabinfo: every write is rejected with -EIO. */
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
		       size_t count, loff_t *ppos)
{
	return -EIO;
}
   6332#endif /* CONFIG_SLUB_DEBUG */