cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

slab.c (108433B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * linux/mm/slab.c
      4 * Written by Mark Hemment, 1996/97.
      5 * (markhe@nextd.demon.co.uk)
      6 *
      7 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
      8 *
      9 * Major cleanup, different bufctl logic, per-cpu arrays
     10 *	(c) 2000 Manfred Spraul
     11 *
     12 * Cleanup, make the head arrays unconditional, preparation for NUMA
     13 * 	(c) 2002 Manfred Spraul
     14 *
     15 * An implementation of the Slab Allocator as described in outline in;
     16 *	UNIX Internals: The New Frontiers by Uresh Vahalia
     17 *	Pub: Prentice Hall	ISBN 0-13-101908-2
     18 * or with a little more detail in;
     19 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
     20 *	Jeff Bonwick (Sun Microsystems).
     21 *	Presented at: USENIX Summer 1994 Technical Conference
     22 *
     23 * The memory is organized in caches, one cache for each object type.
     24 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
      25 * Each cache consists of many slabs (which are small, usually one
      26 * page long, and always contiguous), and each slab contains multiple
      27 * initialized objects.
     28 *
      29 * This means that your constructor is used only for newly allocated
     30 * slabs and you must pass objects with the same initializations to
     31 * kmem_cache_free.
     32 *
     33 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
      34 * normal). If you need a special memory type, then you must create a new
     35 * cache for that memory type.
     36 *
     37 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
     38 *   full slabs with 0 free objects
     39 *   partial slabs
     40 *   empty slabs with no allocated objects
     41 *
     42 * If partial slabs exist, then new allocations come from these slabs,
      43 * otherwise they come from empty slabs, or new slabs are allocated.
     44 *
     45 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
     46 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
     47 *
     48 * Each cache has a short per-cpu head array, most allocs
     49 * and frees go into that array, and if that array overflows, then 1/2
     50 * of the entries in the array are given back into the global cache.
     51 * The head array is strictly LIFO and should improve the cache hit rates.
     52 * On SMP, it additionally reduces the spinlock operations.
     53 *
      54 * The c_cpuarray may not be read with local interrupts enabled -
     55 * it's changed with a smp_call_function().
     56 *
     57 * SMP synchronization:
     58 *  constructors and destructors are called without any locking.
     59 *  Several members in struct kmem_cache and struct slab never change, they
     60 *	are accessed without any locking.
     61 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
     62 *  	and local interrupts are disabled so slab code is preempt-safe.
     63 *  The non-constant members are protected with a per-cache irq spinlock.
     64 *
     65 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
     66 * in 2000 - many ideas in the current implementation are derived from
     67 * his patch.
     68 *
     69 * Further notes from the original documentation:
     70 *
     71 * 11 April '97.  Started multi-threading - markhe
     72 *	The global cache-chain is protected by the mutex 'slab_mutex'.
     73 *	The sem is only needed when accessing/extending the cache-chain, which
     74 *	can never happen inside an interrupt (kmem_cache_create(),
     75 *	kmem_cache_shrink() and kmem_cache_reap()).
     76 *
     77 *	At present, each engine can be growing a cache.  This should be blocked.
     78 *
     79 * 15 March 2005. NUMA slab allocator.
     80 *	Shai Fultheim <shai@scalex86.org>.
     81 *	Shobhit Dayal <shobhit@calsoftinc.com>
     82 *	Alok N Kataria <alokk@calsoftinc.com>
     83 *	Christoph Lameter <christoph@lameter.com>
     84 *
     85 *	Modified the slab allocator to be node aware on NUMA systems.
     86 *	Each node has its own list of partial, free and full slabs.
     87 *	All object allocations for a node occur from node specific slab lists.
     88 */
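/*
 * Illustrative usage sketch ("foo", struct foo and foo_ctor are hypothetical
 * names): a subsystem typically creates one cache per object type and
 * allocates from it, keeping in mind that the constructor only runs when a
 * fresh slab is populated and that objects must be handed back in their
 * constructed state:
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, foo_ctor);
 *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);
 *
 *	kmem_cache_destroy(foo_cache) may then only be called once the caller
 *	has made sure no further allocations can race with it.
 */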
     89
     90#include	<linux/slab.h>
     91#include	<linux/mm.h>
     92#include	<linux/poison.h>
     93#include	<linux/swap.h>
     94#include	<linux/cache.h>
     95#include	<linux/interrupt.h>
     96#include	<linux/init.h>
     97#include	<linux/compiler.h>
     98#include	<linux/cpuset.h>
     99#include	<linux/proc_fs.h>
    100#include	<linux/seq_file.h>
    101#include	<linux/notifier.h>
    102#include	<linux/kallsyms.h>
    103#include	<linux/kfence.h>
    104#include	<linux/cpu.h>
    105#include	<linux/sysctl.h>
    106#include	<linux/module.h>
    107#include	<linux/rcupdate.h>
    108#include	<linux/string.h>
    109#include	<linux/uaccess.h>
    110#include	<linux/nodemask.h>
    111#include	<linux/kmemleak.h>
    112#include	<linux/mempolicy.h>
    113#include	<linux/mutex.h>
    114#include	<linux/fault-inject.h>
    115#include	<linux/rtmutex.h>
    116#include	<linux/reciprocal_div.h>
    117#include	<linux/debugobjects.h>
    118#include	<linux/memory.h>
    119#include	<linux/prefetch.h>
    120#include	<linux/sched/task_stack.h>
    121
    122#include	<net/sock.h>
    123
    124#include	<asm/cacheflush.h>
    125#include	<asm/tlbflush.h>
    126#include	<asm/page.h>
    127
    128#include <trace/events/kmem.h>
    129
    130#include	"internal.h"
    131
    132#include	"slab.h"
    133
    134/*
    135 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
    136 *		  0 for faster, smaller code (especially in the critical paths).
    137 *
    138 * STATS	- 1 to collect stats for /proc/slabinfo.
    139 *		  0 for faster, smaller code (especially in the critical paths).
    140 *
    141 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
    142 */
    143
    144#ifdef CONFIG_DEBUG_SLAB
    145#define	DEBUG		1
    146#define	STATS		1
    147#define	FORCED_DEBUG	1
    148#else
    149#define	DEBUG		0
    150#define	STATS		0
    151#define	FORCED_DEBUG	0
    152#endif
    153
    154/* Shouldn't this be in a header file somewhere? */
    155#define	BYTES_PER_WORD		sizeof(void *)
    156#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
    157
    158#ifndef ARCH_KMALLOC_FLAGS
    159#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
    160#endif
    161
    162#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
    163				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
    164
    165#if FREELIST_BYTE_INDEX
    166typedef unsigned char freelist_idx_t;
    167#else
    168typedef unsigned short freelist_idx_t;
    169#endif
    170
    171#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
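/*
 * Worked example, assuming 4 KiB pages: PAGE_SIZE >> BITS_PER_BYTE is
 * 4096 >> 8 = 16, so whenever SLAB_OBJ_MIN_SIZE is at least 16 bytes a
 * one-byte freelist index is chosen and SLAB_OBJ_MAX_NUM caps a slab at
 * (1 << 8) - 1 = 255 objects; with the two-byte index the cap is 65535.
 */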
    172
    173/*
    174 * struct array_cache
    175 *
    176 * Purpose:
    177 * - LIFO ordering, to hand out cache-warm objects from _alloc
    178 * - reduce the number of linked list operations
    179 * - reduce spinlock operations
    180 *
    181 * The limit is stored in the per-cpu structure to reduce the data cache
    182 * footprint.
    183 *
    184 */
    185struct array_cache {
    186	unsigned int avail;
    187	unsigned int limit;
    188	unsigned int batchcount;
    189	unsigned int touched;
    190	void *entry[];	/*
    191			 * Must have this definition in here for the proper
    192			 * alignment of array_cache. Also simplifies accessing
    193			 * the entries.
    194			 */
    195};
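/*
 * Rough sketch of the fast paths this array enables (simplified from the
 * real code further down): a free pushes onto the top of the stack and an
 * allocation pops the most recently freed, still cache-warm object:
 *
 *	free:	ac->entry[ac->avail++] = objp;
 *	alloc:	objp = ac->entry[--ac->avail];
 *
 * Only when the array runs empty (alloc) or hits its limit (free) does the
 * slow path take the per-node list_lock and move a batchcount of objects.
 */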
    196
    197struct alien_cache {
    198	spinlock_t lock;
    199	struct array_cache ac;
    200};
    201
    202/*
    203 * Need this for bootstrapping a per node allocator.
    204 */
    205#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
    206static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
    207#define	CACHE_CACHE 0
    208#define	SIZE_NODE (MAX_NUMNODES)
    209
    210static int drain_freelist(struct kmem_cache *cache,
    211			struct kmem_cache_node *n, int tofree);
    212static void free_block(struct kmem_cache *cachep, void **objpp, int len,
    213			int node, struct list_head *list);
    214static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
    215static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
    216static void cache_reap(struct work_struct *unused);
    217
    218static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
    219						void **list);
    220static inline void fixup_slab_list(struct kmem_cache *cachep,
    221				struct kmem_cache_node *n, struct slab *slab,
    222				void **list);
    223static int slab_early_init = 1;
    224
    225#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
    226
    227static void kmem_cache_node_init(struct kmem_cache_node *parent)
    228{
    229	INIT_LIST_HEAD(&parent->slabs_full);
    230	INIT_LIST_HEAD(&parent->slabs_partial);
    231	INIT_LIST_HEAD(&parent->slabs_free);
    232	parent->total_slabs = 0;
    233	parent->free_slabs = 0;
    234	parent->shared = NULL;
    235	parent->alien = NULL;
    236	parent->colour_next = 0;
    237	spin_lock_init(&parent->list_lock);
    238	parent->free_objects = 0;
    239	parent->free_touched = 0;
    240}
    241
    242#define MAKE_LIST(cachep, listp, slab, nodeid)				\
    243	do {								\
    244		INIT_LIST_HEAD(listp);					\
    245		list_splice(&get_node(cachep, nodeid)->slab, listp);	\
    246	} while (0)
    247
    248#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
    249	do {								\
    250	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
    251	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
    252	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
    253	} while (0)
    254
    255#define CFLGS_OBJFREELIST_SLAB	((slab_flags_t __force)0x40000000U)
    256#define CFLGS_OFF_SLAB		((slab_flags_t __force)0x80000000U)
    257#define	OBJFREELIST_SLAB(x)	((x)->flags & CFLGS_OBJFREELIST_SLAB)
    258#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
    259
    260#define BATCHREFILL_LIMIT	16
    261/*
    262 * Optimization question: fewer reaps means less probability for unnecessary
    263 * cpucache drain/refill cycles.
    264 *
    265 * OTOH the cpuarrays can contain lots of objects,
    266 * which could lock up otherwise freeable slabs.
    267 */
    268#define REAPTIMEOUT_AC		(2*HZ)
    269#define REAPTIMEOUT_NODE	(4*HZ)
    270
    271#if STATS
    272#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
    273#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
    274#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
    275#define	STATS_INC_GROWN(x)	((x)->grown++)
    276#define	STATS_ADD_REAPED(x, y)	((x)->reaped += (y))
    277#define	STATS_SET_HIGH(x)						\
    278	do {								\
    279		if ((x)->num_active > (x)->high_mark)			\
    280			(x)->high_mark = (x)->num_active;		\
    281	} while (0)
    282#define	STATS_INC_ERR(x)	((x)->errors++)
    283#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
    284#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
    285#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
    286#define	STATS_SET_FREEABLE(x, i)					\
    287	do {								\
    288		if ((x)->max_freeable < i)				\
    289			(x)->max_freeable = i;				\
    290	} while (0)
    291#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
    292#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
    293#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
    294#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
    295#else
    296#define	STATS_INC_ACTIVE(x)	do { } while (0)
    297#define	STATS_DEC_ACTIVE(x)	do { } while (0)
    298#define	STATS_INC_ALLOCED(x)	do { } while (0)
    299#define	STATS_INC_GROWN(x)	do { } while (0)
    300#define	STATS_ADD_REAPED(x, y)	do { (void)(y); } while (0)
    301#define	STATS_SET_HIGH(x)	do { } while (0)
    302#define	STATS_INC_ERR(x)	do { } while (0)
    303#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
    304#define	STATS_INC_NODEFREES(x)	do { } while (0)
    305#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
    306#define	STATS_SET_FREEABLE(x, i) do { } while (0)
    307#define STATS_INC_ALLOCHIT(x)	do { } while (0)
    308#define STATS_INC_ALLOCMISS(x)	do { } while (0)
    309#define STATS_INC_FREEHIT(x)	do { } while (0)
    310#define STATS_INC_FREEMISS(x)	do { } while (0)
    311#endif
    312
    313#if DEBUG
    314
    315/*
    316 * memory layout of objects:
    317 * 0		: objp
    318 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
    319 * 		the end of an object is aligned with the end of the real
    320 * 		allocation. Catches writes behind the end of the allocation.
    321 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
    322 * 		redzone word.
    323 * cachep->obj_offset: The real object.
    324 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
    325 * cachep->size - 1* BYTES_PER_WORD: last caller address
    326 *					[BYTES_PER_WORD long]
    327 */
    328static int obj_offset(struct kmem_cache *cachep)
    329{
    330	return cachep->obj_offset;
    331}
    332
    333static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
    334{
    335	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
    336	return (unsigned long long *) (objp + obj_offset(cachep) -
    337				      sizeof(unsigned long long));
    338}
    339
    340static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
    341{
    342	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
    343	if (cachep->flags & SLAB_STORE_USER)
    344		return (unsigned long long *)(objp + cachep->size -
    345					      sizeof(unsigned long long) -
    346					      REDZONE_ALIGN);
    347	return (unsigned long long *) (objp + cachep->size -
    348				       sizeof(unsigned long long));
    349}
    350
    351static void **dbg_userword(struct kmem_cache *cachep, void *objp)
    352{
    353	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
    354	return (void **)(objp + cachep->size - BYTES_PER_WORD);
    355}
    356
    357#else
    358
    359#define obj_offset(x)			0
    360#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
    361#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
    362#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})
    363
    364#endif
    365
    366/*
    367 * Do not go above this order unless 0 objects fit into the slab or
    368 * overridden on the command line.
    369 */
    370#define	SLAB_MAX_ORDER_HI	1
    371#define	SLAB_MAX_ORDER_LO	0
    372static int slab_max_order = SLAB_MAX_ORDER_LO;
    373static bool slab_max_order_set __initdata;
    374
    375static inline void *index_to_obj(struct kmem_cache *cache,
    376				 const struct slab *slab, unsigned int idx)
    377{
    378	return slab->s_mem + cache->size * idx;
    379}
    380
    381#define BOOT_CPUCACHE_ENTRIES	1
    382/* internal cache of cache description objs */
    383static struct kmem_cache kmem_cache_boot = {
    384	.batchcount = 1,
    385	.limit = BOOT_CPUCACHE_ENTRIES,
    386	.shared = 1,
    387	.size = sizeof(struct kmem_cache),
    388	.name = "kmem_cache",
    389};
    390
    391static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
    392
    393static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
    394{
    395	return this_cpu_ptr(cachep->cpu_cache);
    396}
    397
    398/*
    399 * Calculate the number of objects and left-over bytes for a given buffer size.
    400 */
    401static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
    402		slab_flags_t flags, size_t *left_over)
    403{
    404	unsigned int num;
    405	size_t slab_size = PAGE_SIZE << gfporder;
    406
    407	/*
    408	 * The slab management structure can be either off the slab or
    409	 * on it. For the latter case, the memory allocated for a
    410	 * slab is used for:
    411	 *
    412	 * - @buffer_size bytes for each object
    413	 * - One freelist_idx_t for each object
    414	 *
    415	 * We don't need to consider alignment of freelist because
    416	 * freelist will be at the end of slab page. The objects will be
    417	 * at the correct alignment.
    418	 *
    419	 * If the slab management structure is off the slab, then the
    420	 * alignment will already be calculated into the size. Because
    421	 * the slabs are all pages aligned, the objects will be at the
    422	 * correct alignment when allocated.
    423	 */
    424	if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
    425		num = slab_size / buffer_size;
    426		*left_over = slab_size % buffer_size;
    427	} else {
    428		num = slab_size / (buffer_size + sizeof(freelist_idx_t));
    429		*left_over = slab_size %
    430			(buffer_size + sizeof(freelist_idx_t));
    431	}
    432
    433	return num;
    434}
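/*
 * Worked example, assuming 4 KiB pages and a one-byte freelist_idx_t: for an
 * order-0 slab with on-slab management and buffer_size = 256, each object
 * costs 256 + 1 = 257 bytes, so num = 4096 / 257 = 15 objects and
 * *left_over = 4096 - 15 * 257 = 241 bytes, later used for colouring.
 */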
    435
    436#if DEBUG
    437#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
    438
    439static void __slab_error(const char *function, struct kmem_cache *cachep,
    440			char *msg)
    441{
    442	pr_err("slab error in %s(): cache `%s': %s\n",
    443	       function, cachep->name, msg);
    444	dump_stack();
    445	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
    446}
    447#endif
    448
    449/*
    450 * By default on NUMA we use alien caches to stage the freeing of
    451 * objects allocated from other nodes. This causes massive memory
    452 * inefficiencies when using fake NUMA setup to split memory into a
    453 * large number of small nodes, so it can be disabled on the command
    454 * line
    455  */
    456
    457static int use_alien_caches __read_mostly = 1;
    458static int __init noaliencache_setup(char *s)
    459{
    460	use_alien_caches = 0;
    461	return 1;
    462}
    463__setup("noaliencache", noaliencache_setup);
    464
    465static int __init slab_max_order_setup(char *str)
    466{
    467	get_option(&str, &slab_max_order);
    468	slab_max_order = slab_max_order < 0 ? 0 :
    469				min(slab_max_order, MAX_ORDER - 1);
    470	slab_max_order_set = true;
    471
    472	return 1;
    473}
    474__setup("slab_max_order=", slab_max_order_setup);
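/*
 * Example: booting with "slab_max_order=2" lets calculate_slab_order() pick
 * slabs of up to four contiguous pages (order 2); negative values are clamped
 * to 0 and anything above MAX_ORDER - 1 is capped to MAX_ORDER - 1.
 */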
    475
    476#ifdef CONFIG_NUMA
    477/*
    478 * Special reaping functions for NUMA systems called from cache_reap().
    479 * These take care of doing round robin flushing of alien caches (containing
    480 * objects freed on different nodes from which they were allocated) and the
    481 * flushing of remote pcps by calling drain_node_pages.
    482 */
    483static DEFINE_PER_CPU(unsigned long, slab_reap_node);
    484
    485static void init_reap_node(int cpu)
    486{
    487	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
    488						    node_online_map);
    489}
    490
    491static void next_reap_node(void)
    492{
    493	int node = __this_cpu_read(slab_reap_node);
    494
    495	node = next_node_in(node, node_online_map);
    496	__this_cpu_write(slab_reap_node, node);
    497}
    498
    499#else
    500#define init_reap_node(cpu) do { } while (0)
    501#define next_reap_node(void) do { } while (0)
    502#endif
    503
    504/*
    505 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
    506 * via the workqueue/eventd.
    507 * Add the CPU number into the expiration time to minimize the possibility of
    508 * the CPUs getting into lockstep and contending for the global cache chain
    509 * lock.
    510 */
    511static void start_cpu_timer(int cpu)
    512{
    513	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
    514
    515	if (reap_work->work.func == NULL) {
    516		init_reap_node(cpu);
    517		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
    518		schedule_delayed_work_on(cpu, reap_work,
    519					__round_jiffies_relative(HZ, cpu));
    520	}
    521}
    522
    523static void init_arraycache(struct array_cache *ac, int limit, int batch)
    524{
    525	if (ac) {
    526		ac->avail = 0;
    527		ac->limit = limit;
    528		ac->batchcount = batch;
    529		ac->touched = 0;
    530	}
    531}
    532
    533static struct array_cache *alloc_arraycache(int node, int entries,
    534					    int batchcount, gfp_t gfp)
    535{
    536	size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
    537	struct array_cache *ac = NULL;
    538
    539	ac = kmalloc_node(memsize, gfp, node);
    540	/*
    541	 * The array_cache structures contain pointers to free object.
    542	 * However, when such objects are allocated or transferred to another
    543	 * cache the pointers are not cleared and they could be counted as
    544	 * valid references during a kmemleak scan. Therefore, kmemleak must
    545	 * not scan such objects.
    546	 */
    547	kmemleak_no_scan(ac);
    548	init_arraycache(ac, entries, batchcount);
    549	return ac;
    550}
    551
    552static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
    553					struct slab *slab, void *objp)
    554{
    555	struct kmem_cache_node *n;
    556	int slab_node;
    557	LIST_HEAD(list);
    558
    559	slab_node = slab_nid(slab);
    560	n = get_node(cachep, slab_node);
    561
    562	spin_lock(&n->list_lock);
    563	free_block(cachep, &objp, 1, slab_node, &list);
    564	spin_unlock(&n->list_lock);
    565
    566	slabs_destroy(cachep, &list);
    567}
    568
    569/*
    570 * Transfer objects in one arraycache to another.
    571 * Locking must be handled by the caller.
    572 *
    573 * Return the number of entries transferred.
    574 */
    575static int transfer_objects(struct array_cache *to,
    576		struct array_cache *from, unsigned int max)
    577{
    578	/* Figure out how many entries to transfer */
    579	int nr = min3(from->avail, max, to->limit - to->avail);
    580
    581	if (!nr)
    582		return 0;
    583
    584	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
    585			sizeof(void *) *nr);
    586
    587	from->avail -= nr;
    588	to->avail += nr;
    589	return nr;
    590}
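/*
 * Worked example: with from->avail = 30, max = 64 and room for 12 more
 * entries in the destination (to->limit - to->avail = 12), nr = 12 and the
 * 12 most recently added (hottest) pointers are copied from the top of
 * 'from' onto the top of 'to'.
 */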
    591
    592/* &alien->lock must be held by alien callers. */
    593static __always_inline void __free_one(struct array_cache *ac, void *objp)
    594{
    595	/* Avoid trivial double-free. */
    596	if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
    597	    WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
    598		return;
    599	ac->entry[ac->avail++] = objp;
    600}
    601
    602#ifndef CONFIG_NUMA
    603
    604#define drain_alien_cache(cachep, alien) do { } while (0)
    605#define reap_alien(cachep, n) do { } while (0)
    606
    607static inline struct alien_cache **alloc_alien_cache(int node,
    608						int limit, gfp_t gfp)
    609{
    610	return NULL;
    611}
    612
    613static inline void free_alien_cache(struct alien_cache **ac_ptr)
    614{
    615}
    616
    617static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
    618{
    619	return 0;
    620}
    621
    622static inline gfp_t gfp_exact_node(gfp_t flags)
    623{
    624	return flags & ~__GFP_NOFAIL;
    625}
    626
    627#else	/* CONFIG_NUMA */
    628
    629static struct alien_cache *__alloc_alien_cache(int node, int entries,
    630						int batch, gfp_t gfp)
    631{
    632	size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
    633	struct alien_cache *alc = NULL;
    634
    635	alc = kmalloc_node(memsize, gfp, node);
    636	if (alc) {
    637		kmemleak_no_scan(alc);
    638		init_arraycache(&alc->ac, entries, batch);
    639		spin_lock_init(&alc->lock);
    640	}
    641	return alc;
    642}
    643
    644static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
    645{
    646	struct alien_cache **alc_ptr;
    647	int i;
    648
    649	if (limit > 1)
    650		limit = 12;
    651	alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
    652	if (!alc_ptr)
    653		return NULL;
    654
    655	for_each_node(i) {
    656		if (i == node || !node_online(i))
    657			continue;
    658		alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
    659		if (!alc_ptr[i]) {
    660			for (i--; i >= 0; i--)
    661				kfree(alc_ptr[i]);
    662			kfree(alc_ptr);
    663			return NULL;
    664		}
    665	}
    666	return alc_ptr;
    667}
    668
    669static void free_alien_cache(struct alien_cache **alc_ptr)
    670{
    671	int i;
    672
    673	if (!alc_ptr)
    674		return;
    675	for_each_node(i)
    676	    kfree(alc_ptr[i]);
    677	kfree(alc_ptr);
    678}
    679
    680static void __drain_alien_cache(struct kmem_cache *cachep,
    681				struct array_cache *ac, int node,
    682				struct list_head *list)
    683{
    684	struct kmem_cache_node *n = get_node(cachep, node);
    685
    686	if (ac->avail) {
    687		spin_lock(&n->list_lock);
    688		/*
    689		 * Stuff objects into the remote nodes shared array first.
    690		 * That way we could avoid the overhead of putting the objects
    691		 * into the free lists and getting them back later.
    692		 */
    693		if (n->shared)
    694			transfer_objects(n->shared, ac, ac->limit);
    695
    696		free_block(cachep, ac->entry, ac->avail, node, list);
    697		ac->avail = 0;
    698		spin_unlock(&n->list_lock);
    699	}
    700}
    701
    702/*
    703 * Called from cache_reap() to regularly drain alien caches round robin.
    704 */
    705static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
    706{
    707	int node = __this_cpu_read(slab_reap_node);
    708
    709	if (n->alien) {
    710		struct alien_cache *alc = n->alien[node];
    711		struct array_cache *ac;
    712
    713		if (alc) {
    714			ac = &alc->ac;
    715			if (ac->avail && spin_trylock_irq(&alc->lock)) {
    716				LIST_HEAD(list);
    717
    718				__drain_alien_cache(cachep, ac, node, &list);
    719				spin_unlock_irq(&alc->lock);
    720				slabs_destroy(cachep, &list);
    721			}
    722		}
    723	}
    724}
    725
    726static void drain_alien_cache(struct kmem_cache *cachep,
    727				struct alien_cache **alien)
    728{
    729	int i = 0;
    730	struct alien_cache *alc;
    731	struct array_cache *ac;
    732	unsigned long flags;
    733
    734	for_each_online_node(i) {
    735		alc = alien[i];
    736		if (alc) {
    737			LIST_HEAD(list);
    738
    739			ac = &alc->ac;
    740			spin_lock_irqsave(&alc->lock, flags);
    741			__drain_alien_cache(cachep, ac, i, &list);
    742			spin_unlock_irqrestore(&alc->lock, flags);
    743			slabs_destroy(cachep, &list);
    744		}
    745	}
    746}
    747
    748static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
    749				int node, int slab_node)
    750{
    751	struct kmem_cache_node *n;
    752	struct alien_cache *alien = NULL;
    753	struct array_cache *ac;
    754	LIST_HEAD(list);
    755
    756	n = get_node(cachep, node);
    757	STATS_INC_NODEFREES(cachep);
    758	if (n->alien && n->alien[slab_node]) {
    759		alien = n->alien[slab_node];
    760		ac = &alien->ac;
    761		spin_lock(&alien->lock);
    762		if (unlikely(ac->avail == ac->limit)) {
    763			STATS_INC_ACOVERFLOW(cachep);
    764			__drain_alien_cache(cachep, ac, slab_node, &list);
    765		}
    766		__free_one(ac, objp);
    767		spin_unlock(&alien->lock);
    768		slabs_destroy(cachep, &list);
    769	} else {
    770		n = get_node(cachep, slab_node);
    771		spin_lock(&n->list_lock);
    772		free_block(cachep, &objp, 1, slab_node, &list);
    773		spin_unlock(&n->list_lock);
    774		slabs_destroy(cachep, &list);
    775	}
    776	return 1;
    777}
    778
    779static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
    780{
    781	int slab_node = slab_nid(virt_to_slab(objp));
    782	int node = numa_mem_id();
    783	/*
    784	 * Make sure we are not freeing an object from another node to the array
    785	 * cache on this cpu.
    786	 */
    787	if (likely(node == slab_node))
    788		return 0;
    789
    790	return __cache_free_alien(cachep, objp, node, slab_node);
    791}
    792
    793/*
    794 * Construct gfp mask to allocate from a specific node but do not reclaim or
    795 * warn about failures.
    796 */
    797static inline gfp_t gfp_exact_node(gfp_t flags)
    798{
    799	return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
    800}
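/*
 * Worked example: gfp_exact_node(GFP_KERNEL) evaluates to
 * (GFP_KERNEL | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM | __GFP_NOFAIL),
 * i.e. an allocation pinned to the requested node that fails quickly and
 * quietly instead of reclaiming or falling back to another node.
 */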
    801#endif
    802
    803static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
    804{
    805	struct kmem_cache_node *n;
    806
    807	/*
    808	 * Set up the kmem_cache_node for cpu before we can
    809	 * begin anything. Make sure some other cpu on this
    810	 * node has not already allocated this
    811	 */
    812	n = get_node(cachep, node);
    813	if (n) {
    814		spin_lock_irq(&n->list_lock);
    815		n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
    816				cachep->num;
    817		spin_unlock_irq(&n->list_lock);
    818
    819		return 0;
    820	}
    821
    822	n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
    823	if (!n)
    824		return -ENOMEM;
    825
    826	kmem_cache_node_init(n);
    827	n->next_reap = jiffies + REAPTIMEOUT_NODE +
    828		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
    829
    830	n->free_limit =
    831		(1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
    832
    833	/*
    834	 * The kmem_cache_nodes don't come and go as CPUs
    835	 * come and go.  slab_mutex provides sufficient
    836	 * protection here.
    837	 */
    838	cachep->node[node] = n;
    839
    840	return 0;
    841}
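/*
 * Worked example: on a node with 4 CPUs, cachep->batchcount = 16 and
 * cachep->num = 32 objects per slab, free_limit = (1 + 4) * 16 + 32 = 112,
 * roughly the number of free objects the node may keep cached before
 * completely free slabs start being handed back to the page allocator.
 */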
    842
    843#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP)
    844/*
     845 * Allocates and initializes a kmem_cache_node for a node on each slab cache, used for
    846 * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
    847 * will be allocated off-node since memory is not yet online for the new node.
    848 * When hotplugging memory or a cpu, existing nodes are not replaced if
    849 * already in use.
    850 *
    851 * Must hold slab_mutex.
    852 */
    853static int init_cache_node_node(int node)
    854{
    855	int ret;
    856	struct kmem_cache *cachep;
    857
    858	list_for_each_entry(cachep, &slab_caches, list) {
    859		ret = init_cache_node(cachep, node, GFP_KERNEL);
    860		if (ret)
    861			return ret;
    862	}
    863
    864	return 0;
    865}
    866#endif
    867
    868static int setup_kmem_cache_node(struct kmem_cache *cachep,
    869				int node, gfp_t gfp, bool force_change)
    870{
    871	int ret = -ENOMEM;
    872	struct kmem_cache_node *n;
    873	struct array_cache *old_shared = NULL;
    874	struct array_cache *new_shared = NULL;
    875	struct alien_cache **new_alien = NULL;
    876	LIST_HEAD(list);
    877
    878	if (use_alien_caches) {
    879		new_alien = alloc_alien_cache(node, cachep->limit, gfp);
    880		if (!new_alien)
    881			goto fail;
    882	}
    883
    884	if (cachep->shared) {
    885		new_shared = alloc_arraycache(node,
    886			cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
    887		if (!new_shared)
    888			goto fail;
    889	}
    890
    891	ret = init_cache_node(cachep, node, gfp);
    892	if (ret)
    893		goto fail;
    894
    895	n = get_node(cachep, node);
    896	spin_lock_irq(&n->list_lock);
    897	if (n->shared && force_change) {
    898		free_block(cachep, n->shared->entry,
    899				n->shared->avail, node, &list);
    900		n->shared->avail = 0;
    901	}
    902
    903	if (!n->shared || force_change) {
    904		old_shared = n->shared;
    905		n->shared = new_shared;
    906		new_shared = NULL;
    907	}
    908
    909	if (!n->alien) {
    910		n->alien = new_alien;
    911		new_alien = NULL;
    912	}
    913
    914	spin_unlock_irq(&n->list_lock);
    915	slabs_destroy(cachep, &list);
    916
    917	/*
    918	 * To protect lockless access to n->shared during irq disabled context.
    919	 * If n->shared isn't NULL in irq disabled context, accessing to it is
    920	 * guaranteed to be valid until irq is re-enabled, because it will be
    921	 * freed after synchronize_rcu().
    922	 */
    923	if (old_shared && force_change)
    924		synchronize_rcu();
    925
    926fail:
    927	kfree(old_shared);
    928	kfree(new_shared);
    929	free_alien_cache(new_alien);
    930
    931	return ret;
    932}
    933
    934#ifdef CONFIG_SMP
    935
    936static void cpuup_canceled(long cpu)
    937{
    938	struct kmem_cache *cachep;
    939	struct kmem_cache_node *n = NULL;
    940	int node = cpu_to_mem(cpu);
    941	const struct cpumask *mask = cpumask_of_node(node);
    942
    943	list_for_each_entry(cachep, &slab_caches, list) {
    944		struct array_cache *nc;
    945		struct array_cache *shared;
    946		struct alien_cache **alien;
    947		LIST_HEAD(list);
    948
    949		n = get_node(cachep, node);
    950		if (!n)
    951			continue;
    952
    953		spin_lock_irq(&n->list_lock);
    954
    955		/* Free limit for this kmem_cache_node */
    956		n->free_limit -= cachep->batchcount;
    957
    958		/* cpu is dead; no one can alloc from it. */
    959		nc = per_cpu_ptr(cachep->cpu_cache, cpu);
    960		free_block(cachep, nc->entry, nc->avail, node, &list);
    961		nc->avail = 0;
    962
    963		if (!cpumask_empty(mask)) {
    964			spin_unlock_irq(&n->list_lock);
    965			goto free_slab;
    966		}
    967
    968		shared = n->shared;
    969		if (shared) {
    970			free_block(cachep, shared->entry,
    971				   shared->avail, node, &list);
    972			n->shared = NULL;
    973		}
    974
    975		alien = n->alien;
    976		n->alien = NULL;
    977
    978		spin_unlock_irq(&n->list_lock);
    979
    980		kfree(shared);
    981		if (alien) {
    982			drain_alien_cache(cachep, alien);
    983			free_alien_cache(alien);
    984		}
    985
    986free_slab:
    987		slabs_destroy(cachep, &list);
    988	}
    989	/*
    990	 * In the previous loop, all the objects were freed to
     991	 * the respective cache's slabs; now we can go ahead and
    992	 * shrink each nodelist to its limit.
    993	 */
    994	list_for_each_entry(cachep, &slab_caches, list) {
    995		n = get_node(cachep, node);
    996		if (!n)
    997			continue;
    998		drain_freelist(cachep, n, INT_MAX);
    999	}
   1000}
   1001
   1002static int cpuup_prepare(long cpu)
   1003{
   1004	struct kmem_cache *cachep;
   1005	int node = cpu_to_mem(cpu);
   1006	int err;
   1007
   1008	/*
   1009	 * We need to do this right in the beginning since
   1010	 * alloc_arraycache's are going to use this list.
   1011	 * kmalloc_node allows us to add the slab to the right
   1012	 * kmem_cache_node and not this cpu's kmem_cache_node
   1013	 */
   1014	err = init_cache_node_node(node);
   1015	if (err < 0)
   1016		goto bad;
   1017
   1018	/*
   1019	 * Now we can go ahead with allocating the shared arrays and
   1020	 * array caches
   1021	 */
   1022	list_for_each_entry(cachep, &slab_caches, list) {
   1023		err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
   1024		if (err)
   1025			goto bad;
   1026	}
   1027
   1028	return 0;
   1029bad:
   1030	cpuup_canceled(cpu);
   1031	return -ENOMEM;
   1032}
   1033
   1034int slab_prepare_cpu(unsigned int cpu)
   1035{
   1036	int err;
   1037
   1038	mutex_lock(&slab_mutex);
   1039	err = cpuup_prepare(cpu);
   1040	mutex_unlock(&slab_mutex);
   1041	return err;
   1042}
   1043
   1044/*
   1045 * This is called for a failed online attempt and for a successful
   1046 * offline.
   1047 *
   1048 * Even if all the cpus of a node are down, we don't free the
   1049 * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and
   1050 * a kmalloc allocation from another cpu for memory from the node of
   1051 * the cpu going down.  The kmem_cache_node structure is usually allocated from
   1052 * kmem_cache_create() and gets destroyed at kmem_cache_destroy().
   1053 */
   1054int slab_dead_cpu(unsigned int cpu)
   1055{
   1056	mutex_lock(&slab_mutex);
   1057	cpuup_canceled(cpu);
   1058	mutex_unlock(&slab_mutex);
   1059	return 0;
   1060}
   1061#endif
   1062
   1063static int slab_online_cpu(unsigned int cpu)
   1064{
   1065	start_cpu_timer(cpu);
   1066	return 0;
   1067}
   1068
   1069static int slab_offline_cpu(unsigned int cpu)
   1070{
   1071	/*
   1072	 * Shutdown cache reaper. Note that the slab_mutex is held so
   1073	 * that if cache_reap() is invoked it cannot do anything
   1074	 * expensive but will only modify reap_work and reschedule the
   1075	 * timer.
   1076	 */
   1077	cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
   1078	/* Now the cache_reaper is guaranteed to be not running. */
   1079	per_cpu(slab_reap_work, cpu).work.func = NULL;
   1080	return 0;
   1081}
   1082
   1083#if defined(CONFIG_NUMA)
   1084/*
   1085 * Drains freelist for a node on each slab cache, used for memory hot-remove.
   1086 * Returns -EBUSY if all objects cannot be drained so that the node is not
   1087 * removed.
   1088 *
   1089 * Must hold slab_mutex.
   1090 */
   1091static int __meminit drain_cache_node_node(int node)
   1092{
   1093	struct kmem_cache *cachep;
   1094	int ret = 0;
   1095
   1096	list_for_each_entry(cachep, &slab_caches, list) {
   1097		struct kmem_cache_node *n;
   1098
   1099		n = get_node(cachep, node);
   1100		if (!n)
   1101			continue;
   1102
   1103		drain_freelist(cachep, n, INT_MAX);
   1104
   1105		if (!list_empty(&n->slabs_full) ||
   1106		    !list_empty(&n->slabs_partial)) {
   1107			ret = -EBUSY;
   1108			break;
   1109		}
   1110	}
   1111	return ret;
   1112}
   1113
   1114static int __meminit slab_memory_callback(struct notifier_block *self,
   1115					unsigned long action, void *arg)
   1116{
   1117	struct memory_notify *mnb = arg;
   1118	int ret = 0;
   1119	int nid;
   1120
   1121	nid = mnb->status_change_nid;
   1122	if (nid < 0)
   1123		goto out;
   1124
   1125	switch (action) {
   1126	case MEM_GOING_ONLINE:
   1127		mutex_lock(&slab_mutex);
   1128		ret = init_cache_node_node(nid);
   1129		mutex_unlock(&slab_mutex);
   1130		break;
   1131	case MEM_GOING_OFFLINE:
   1132		mutex_lock(&slab_mutex);
   1133		ret = drain_cache_node_node(nid);
   1134		mutex_unlock(&slab_mutex);
   1135		break;
   1136	case MEM_ONLINE:
   1137	case MEM_OFFLINE:
   1138	case MEM_CANCEL_ONLINE:
   1139	case MEM_CANCEL_OFFLINE:
   1140		break;
   1141	}
   1142out:
   1143	return notifier_from_errno(ret);
   1144}
   1145#endif /* CONFIG_NUMA */
   1146
   1147/*
   1148 * swap the static kmem_cache_node with kmalloced memory
   1149 */
   1150static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
   1151				int nodeid)
   1152{
   1153	struct kmem_cache_node *ptr;
   1154
   1155	ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
   1156	BUG_ON(!ptr);
   1157
   1158	memcpy(ptr, list, sizeof(struct kmem_cache_node));
   1159	/*
   1160	 * Do not assume that spinlocks can be initialized via memcpy:
   1161	 */
   1162	spin_lock_init(&ptr->list_lock);
   1163
   1164	MAKE_ALL_LISTS(cachep, ptr, nodeid);
   1165	cachep->node[nodeid] = ptr;
   1166}
   1167
   1168/*
    1169 * For setting up all the kmem_cache_node of a cache whose buffer_size is the
    1170 * same as the size of kmem_cache_node.
   1171 */
   1172static void __init set_up_node(struct kmem_cache *cachep, int index)
   1173{
   1174	int node;
   1175
   1176	for_each_online_node(node) {
   1177		cachep->node[node] = &init_kmem_cache_node[index + node];
   1178		cachep->node[node]->next_reap = jiffies +
   1179		    REAPTIMEOUT_NODE +
   1180		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
   1181	}
   1182}
   1183
   1184/*
    1185 * Initialisation.  Called after the page allocator has been initialised and
   1186 * before smp_init().
   1187 */
   1188void __init kmem_cache_init(void)
   1189{
   1190	int i;
   1191
   1192	kmem_cache = &kmem_cache_boot;
   1193
   1194	if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
   1195		use_alien_caches = 0;
   1196
   1197	for (i = 0; i < NUM_INIT_LISTS; i++)
   1198		kmem_cache_node_init(&init_kmem_cache_node[i]);
   1199
   1200	/*
   1201	 * Fragmentation resistance on low memory - only use bigger
   1202	 * page orders on machines with more than 32MB of memory if
   1203	 * not overridden on the command line.
   1204	 */
   1205	if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
   1206		slab_max_order = SLAB_MAX_ORDER_HI;
   1207
   1208	/* Bootstrap is tricky, because several objects are allocated
   1209	 * from caches that do not exist yet:
   1210	 * 1) initialize the kmem_cache cache: it contains the struct
   1211	 *    kmem_cache structures of all caches, except kmem_cache itself:
   1212	 *    kmem_cache is statically allocated.
   1213	 *    Initially an __init data area is used for the head array and the
   1214	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated
   1215	 *    array at the end of the bootstrap.
   1216	 * 2) Create the first kmalloc cache.
   1217	 *    The struct kmem_cache for the new cache is allocated normally.
   1218	 *    An __init data area is used for the head array.
   1219	 * 3) Create the remaining kmalloc caches, with minimally sized
   1220	 *    head arrays.
   1221	 * 4) Replace the __init data head arrays for kmem_cache and the first
   1222	 *    kmalloc cache with kmalloc allocated arrays.
   1223	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
   1224	 *    the other cache's with kmalloc allocated memory.
   1225	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
   1226	 */
   1227
   1228	/* 1) create the kmem_cache */
   1229
   1230	/*
   1231	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
   1232	 */
   1233	create_boot_cache(kmem_cache, "kmem_cache",
   1234		offsetof(struct kmem_cache, node) +
   1235				  nr_node_ids * sizeof(struct kmem_cache_node *),
   1236				  SLAB_HWCACHE_ALIGN, 0, 0);
   1237	list_add(&kmem_cache->list, &slab_caches);
   1238	slab_state = PARTIAL;
   1239
   1240	/*
   1241	 * Initialize the caches that provide memory for the  kmem_cache_node
   1242	 * structures first.  Without this, further allocations will bug.
   1243	 */
   1244	kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
   1245				kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
   1246				kmalloc_info[INDEX_NODE].size,
   1247				ARCH_KMALLOC_FLAGS, 0,
   1248				kmalloc_info[INDEX_NODE].size);
   1249	slab_state = PARTIAL_NODE;
   1250	setup_kmalloc_cache_index_table();
   1251
   1252	slab_early_init = 0;
   1253
   1254	/* 5) Replace the bootstrap kmem_cache_node */
   1255	{
   1256		int nid;
   1257
   1258		for_each_online_node(nid) {
   1259			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
   1260
   1261			init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
   1262					  &init_kmem_cache_node[SIZE_NODE + nid], nid);
   1263		}
   1264	}
   1265
   1266	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
   1267}
   1268
   1269void __init kmem_cache_init_late(void)
   1270{
   1271	struct kmem_cache *cachep;
   1272
   1273	/* 6) resize the head arrays to their final sizes */
   1274	mutex_lock(&slab_mutex);
   1275	list_for_each_entry(cachep, &slab_caches, list)
   1276		if (enable_cpucache(cachep, GFP_NOWAIT))
   1277			BUG();
   1278	mutex_unlock(&slab_mutex);
   1279
   1280	/* Done! */
   1281	slab_state = FULL;
   1282
   1283#ifdef CONFIG_NUMA
   1284	/*
   1285	 * Register a memory hotplug callback that initializes and frees
   1286	 * node.
   1287	 */
   1288	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
   1289#endif
   1290
   1291	/*
   1292	 * The reap timers are started later, with a module init call: That part
   1293	 * of the kernel is not yet operational.
   1294	 */
   1295}
   1296
   1297static int __init cpucache_init(void)
   1298{
   1299	int ret;
   1300
   1301	/*
   1302	 * Register the timers that return unneeded pages to the page allocator
   1303	 */
   1304	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online",
   1305				slab_online_cpu, slab_offline_cpu);
   1306	WARN_ON(ret < 0);
   1307
   1308	return 0;
   1309}
   1310__initcall(cpucache_init);
   1311
   1312static noinline void
   1313slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
   1314{
   1315#if DEBUG
   1316	struct kmem_cache_node *n;
   1317	unsigned long flags;
   1318	int node;
   1319	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
   1320				      DEFAULT_RATELIMIT_BURST);
   1321
   1322	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
   1323		return;
   1324
   1325	pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
   1326		nodeid, gfpflags, &gfpflags);
   1327	pr_warn("  cache: %s, object size: %d, order: %d\n",
   1328		cachep->name, cachep->size, cachep->gfporder);
   1329
   1330	for_each_kmem_cache_node(cachep, node, n) {
   1331		unsigned long total_slabs, free_slabs, free_objs;
   1332
   1333		spin_lock_irqsave(&n->list_lock, flags);
   1334		total_slabs = n->total_slabs;
   1335		free_slabs = n->free_slabs;
   1336		free_objs = n->free_objects;
   1337		spin_unlock_irqrestore(&n->list_lock, flags);
   1338
   1339		pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
   1340			node, total_slabs - free_slabs, total_slabs,
   1341			(total_slabs * cachep->num) - free_objs,
   1342			total_slabs * cachep->num);
   1343	}
   1344#endif
   1345}
   1346
   1347/*
   1348 * Interface to system's page allocator. No need to hold the
   1349 * kmem_cache_node ->list_lock.
   1350 *
   1351 * If we requested dmaable memory, we will get it. Even if we
   1352 * did not request dmaable memory, we might get it, but that
   1353 * would be relatively rare and ignorable.
   1354 */
   1355static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
   1356								int nodeid)
   1357{
   1358	struct folio *folio;
   1359	struct slab *slab;
   1360
   1361	flags |= cachep->allocflags;
   1362
   1363	folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder);
   1364	if (!folio) {
   1365		slab_out_of_memory(cachep, flags, nodeid);
   1366		return NULL;
   1367	}
   1368
   1369	slab = folio_slab(folio);
   1370
   1371	account_slab(slab, cachep->gfporder, cachep, flags);
   1372	__folio_set_slab(folio);
   1373	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
   1374	if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0)))
   1375		slab_set_pfmemalloc(slab);
   1376
   1377	return slab;
   1378}
   1379
   1380/*
   1381 * Interface to system's page release.
   1382 */
   1383static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
   1384{
   1385	int order = cachep->gfporder;
   1386	struct folio *folio = slab_folio(slab);
   1387
   1388	BUG_ON(!folio_test_slab(folio));
   1389	__slab_clear_pfmemalloc(slab);
   1390	__folio_clear_slab(folio);
   1391	page_mapcount_reset(folio_page(folio, 0));
   1392	folio->mapping = NULL;
   1393
   1394	if (current->reclaim_state)
   1395		current->reclaim_state->reclaimed_slab += 1 << order;
   1396	unaccount_slab(slab, order, cachep);
   1397	__free_pages(folio_page(folio, 0), order);
   1398}
   1399
   1400static void kmem_rcu_free(struct rcu_head *head)
   1401{
   1402	struct kmem_cache *cachep;
   1403	struct slab *slab;
   1404
   1405	slab = container_of(head, struct slab, rcu_head);
   1406	cachep = slab->slab_cache;
   1407
   1408	kmem_freepages(cachep, slab);
   1409}
   1410
   1411#if DEBUG
   1412static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
   1413{
   1414	if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
   1415		(cachep->size % PAGE_SIZE) == 0)
   1416		return true;
   1417
   1418	return false;
   1419}
   1420
   1421#ifdef CONFIG_DEBUG_PAGEALLOC
   1422static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
   1423{
   1424	if (!is_debug_pagealloc_cache(cachep))
   1425		return;
   1426
   1427	__kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
   1428}
   1429
   1430#else
   1431static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
   1432				int map) {}
   1433
   1434#endif
   1435
   1436static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
   1437{
   1438	int size = cachep->object_size;
   1439	addr = &((char *)addr)[obj_offset(cachep)];
   1440
   1441	memset(addr, val, size);
   1442	*(unsigned char *)(addr + size - 1) = POISON_END;
   1443}
   1444
   1445static void dump_line(char *data, int offset, int limit)
   1446{
   1447	int i;
   1448	unsigned char error = 0;
   1449	int bad_count = 0;
   1450
   1451	pr_err("%03x: ", offset);
   1452	for (i = 0; i < limit; i++) {
   1453		if (data[offset + i] != POISON_FREE) {
   1454			error = data[offset + i];
   1455			bad_count++;
   1456		}
   1457	}
   1458	print_hex_dump(KERN_CONT, "", 0, 16, 1,
   1459			&data[offset], limit, 1);
   1460
   1461	if (bad_count == 1) {
   1462		error ^= POISON_FREE;
   1463		if (!(error & (error - 1))) {
   1464			pr_err("Single bit error detected. Probably bad RAM.\n");
   1465#ifdef CONFIG_X86
   1466			pr_err("Run memtest86+ or a similar memory test tool.\n");
   1467#else
   1468			pr_err("Run a memory test tool.\n");
   1469#endif
   1470		}
   1471	}
   1472}
   1473#endif
   1474
   1475#if DEBUG
   1476
   1477static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
   1478{
   1479	int i, size;
   1480	char *realobj;
   1481
   1482	if (cachep->flags & SLAB_RED_ZONE) {
   1483		pr_err("Redzone: 0x%llx/0x%llx\n",
   1484		       *dbg_redzone1(cachep, objp),
   1485		       *dbg_redzone2(cachep, objp));
   1486	}
   1487
   1488	if (cachep->flags & SLAB_STORE_USER)
   1489		pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
   1490	realobj = (char *)objp + obj_offset(cachep);
   1491	size = cachep->object_size;
   1492	for (i = 0; i < size && lines; i += 16, lines--) {
   1493		int limit;
   1494		limit = 16;
   1495		if (i + limit > size)
   1496			limit = size - i;
   1497		dump_line(realobj, i, limit);
   1498	}
   1499}
   1500
   1501static void check_poison_obj(struct kmem_cache *cachep, void *objp)
   1502{
   1503	char *realobj;
   1504	int size, i;
   1505	int lines = 0;
   1506
   1507	if (is_debug_pagealloc_cache(cachep))
   1508		return;
   1509
   1510	realobj = (char *)objp + obj_offset(cachep);
   1511	size = cachep->object_size;
   1512
   1513	for (i = 0; i < size; i++) {
   1514		char exp = POISON_FREE;
   1515		if (i == size - 1)
   1516			exp = POISON_END;
   1517		if (realobj[i] != exp) {
   1518			int limit;
   1519			/* Mismatch ! */
   1520			/* Print header */
   1521			if (lines == 0) {
   1522				pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
   1523				       print_tainted(), cachep->name,
   1524				       realobj, size);
   1525				print_objinfo(cachep, objp, 0);
   1526			}
   1527			/* Hexdump the affected line */
   1528			i = (i / 16) * 16;
   1529			limit = 16;
   1530			if (i + limit > size)
   1531				limit = size - i;
   1532			dump_line(realobj, i, limit);
   1533			i += 16;
   1534			lines++;
   1535			/* Limit to 5 lines */
   1536			if (lines > 5)
   1537				break;
   1538		}
   1539	}
   1540	if (lines != 0) {
   1541		/* Print some data about the neighboring objects, if they
   1542		 * exist:
   1543		 */
   1544		struct slab *slab = virt_to_slab(objp);
   1545		unsigned int objnr;
   1546
   1547		objnr = obj_to_index(cachep, slab, objp);
   1548		if (objnr) {
   1549			objp = index_to_obj(cachep, slab, objnr - 1);
   1550			realobj = (char *)objp + obj_offset(cachep);
   1551			pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
   1552			print_objinfo(cachep, objp, 2);
   1553		}
   1554		if (objnr + 1 < cachep->num) {
   1555			objp = index_to_obj(cachep, slab, objnr + 1);
   1556			realobj = (char *)objp + obj_offset(cachep);
   1557			pr_err("Next obj: start=%px, len=%d\n", realobj, size);
   1558			print_objinfo(cachep, objp, 2);
   1559		}
   1560	}
   1561}
   1562#endif
   1563
   1564#if DEBUG
   1565static void slab_destroy_debugcheck(struct kmem_cache *cachep,
   1566						struct slab *slab)
   1567{
   1568	int i;
   1569
   1570	if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
   1571		poison_obj(cachep, slab->freelist - obj_offset(cachep),
   1572			POISON_FREE);
   1573	}
   1574
   1575	for (i = 0; i < cachep->num; i++) {
   1576		void *objp = index_to_obj(cachep, slab, i);
   1577
   1578		if (cachep->flags & SLAB_POISON) {
   1579			check_poison_obj(cachep, objp);
   1580			slab_kernel_map(cachep, objp, 1);
   1581		}
   1582		if (cachep->flags & SLAB_RED_ZONE) {
   1583			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
   1584				slab_error(cachep, "start of a freed object was overwritten");
   1585			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
   1586				slab_error(cachep, "end of a freed object was overwritten");
   1587		}
   1588	}
   1589}
   1590#else
   1591static void slab_destroy_debugcheck(struct kmem_cache *cachep,
   1592						struct slab *slab)
   1593{
   1594}
   1595#endif
   1596
   1597/**
   1598 * slab_destroy - destroy and release all objects in a slab
   1599 * @cachep: cache pointer being destroyed
   1600 * @slab: slab being destroyed
   1601 *
   1602 * Destroy all the objs in a slab, and release the mem back to the system.
   1603 * Before calling the slab must have been unlinked from the cache. The
   1604 * kmem_cache_node ->list_lock is not held/needed.
   1605 */
   1606static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
   1607{
   1608	void *freelist;
   1609
   1610	freelist = slab->freelist;
   1611	slab_destroy_debugcheck(cachep, slab);
   1612	if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
   1613		call_rcu(&slab->rcu_head, kmem_rcu_free);
   1614	else
   1615		kmem_freepages(cachep, slab);
   1616
   1617	/*
   1618	 * From now on, we don't use freelist
   1619	 * although actual page can be freed in rcu context
   1620	 */
   1621	if (OFF_SLAB(cachep))
   1622		kmem_cache_free(cachep->freelist_cache, freelist);
   1623}
   1624
   1625/*
   1626 * Update the size of the caches before calling slabs_destroy as it may
   1627 * recursively call kfree.
   1628 */
   1629static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
   1630{
   1631	struct slab *slab, *n;
   1632
   1633	list_for_each_entry_safe(slab, n, list, slab_list) {
   1634		list_del(&slab->slab_list);
   1635		slab_destroy(cachep, slab);
   1636	}
   1637}
   1638
   1639/**
   1640 * calculate_slab_order - calculate size (page order) of slabs
   1641 * @cachep: pointer to the cache that is being created
   1642 * @size: size of objects to be created in this cache.
   1643 * @flags: slab allocation flags
   1644 *
   1645 * Also calculates the number of objects per slab.
   1646 *
   1647 * This could be made much more intelligent.  For now, try to avoid using
   1648 * high order pages for slabs.  When the gfp() functions are more friendly
   1649 * towards high-order requests, this should be changed.
   1650 *
   1651 * Return: number of left-over bytes in a slab
   1652 */
   1653static size_t calculate_slab_order(struct kmem_cache *cachep,
   1654				size_t size, slab_flags_t flags)
   1655{
   1656	size_t left_over = 0;
   1657	int gfporder;
   1658
   1659	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
   1660		unsigned int num;
   1661		size_t remainder;
   1662
   1663		num = cache_estimate(gfporder, size, flags, &remainder);
   1664		if (!num)
   1665			continue;
   1666
   1667		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
   1668		if (num > SLAB_OBJ_MAX_NUM)
   1669			break;
   1670
   1671		if (flags & CFLGS_OFF_SLAB) {
   1672			struct kmem_cache *freelist_cache;
   1673			size_t freelist_size;
   1674
   1675			freelist_size = num * sizeof(freelist_idx_t);
   1676			freelist_cache = kmalloc_slab(freelist_size, 0u);
   1677			if (!freelist_cache)
   1678				continue;
   1679
   1680			/*
   1681			 * Needed to avoid possible looping condition
   1682			 * in cache_grow_begin()
   1683			 */
   1684			if (OFF_SLAB(freelist_cache))
   1685				continue;
   1686
   1687			/* check if off slab has enough benefit */
   1688			if (freelist_cache->size > cachep->size / 2)
   1689				continue;
   1690		}
   1691
   1692		/* Found something acceptable - save it away */
   1693		cachep->num = num;
   1694		cachep->gfporder = gfporder;
   1695		left_over = remainder;
   1696
   1697		/*
   1698		 * A VFS-reclaimable slab tends to have most allocations
   1699		 * as GFP_NOFS and we really don't want to have to be allocating
   1700		 * higher-order pages when we are unable to shrink dcache.
   1701		 */
   1702		if (flags & SLAB_RECLAIM_ACCOUNT)
   1703			break;
   1704
   1705		/*
   1706		 * Large number of objects is good, but very large slabs are
   1707		 * currently bad for the gfp()s.
   1708		 */
   1709		if (gfporder >= slab_max_order)
   1710			break;
   1711
   1712		/*
   1713		 * Acceptable internal fragmentation?
   1714		 */
   1715		if (left_over * 8 <= (PAGE_SIZE << gfporder))
   1716			break;
   1717	}
   1718	return left_over;
   1719}
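
        /*
         * Worked example of the acceptance checks above, using hypothetical
         * numbers (PAGE_SIZE = 4096; freelist and alignment overhead ignored
         * for clarity):
         *
         *	object size 1000, order 0: num = 4, left_over = 96
         *		96 * 8 = 768 <= 4096	-> acceptable, stop at order 0
         *	object size 1500, order 0: num = 2, left_over = 1096
         *		1096 * 8 = 8768 > 4096	-> too wasteful, try order 1
         *	object size 1500, order 1: num = 5, left_over = 692
         *		692 * 8 = 5536 <= 8192	-> acceptable
         *
         * slab_max_order and the other early exits above cap how far this
         * search is allowed to climb.
         */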
   1720
   1721static struct array_cache __percpu *alloc_kmem_cache_cpus(
   1722		struct kmem_cache *cachep, int entries, int batchcount)
   1723{
   1724	int cpu;
   1725	size_t size;
   1726	struct array_cache __percpu *cpu_cache;
   1727
   1728	size = sizeof(void *) * entries + sizeof(struct array_cache);
   1729	cpu_cache = __alloc_percpu(size, sizeof(void *));
   1730
   1731	if (!cpu_cache)
   1732		return NULL;
   1733
   1734	for_each_possible_cpu(cpu) {
   1735		init_arraycache(per_cpu_ptr(cpu_cache, cpu),
   1736				entries, batchcount);
   1737	}
   1738
   1739	return cpu_cache;
   1740}
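
        /*
         * Sizing sketch for the per-cpu allocation above, assuming struct
         * array_cache ends in a flexible "void *entry[]" array (the entry
         * count is hypothetical):
         *
         *	entries = 16:
         *	size = sizeof(struct array_cache) + 16 * sizeof(void *)
         *
         * so every possible CPU gets its own header (avail, limit,
         * batchcount, touched) followed by room for 16 cached object
         * pointers, used as a LIFO stack by ____cache_alloc() and
         * __cache_free().
         */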
   1741
   1742static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
   1743{
   1744	if (slab_state >= FULL)
   1745		return enable_cpucache(cachep, gfp);
   1746
   1747	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
   1748	if (!cachep->cpu_cache)
   1749		return 1;
   1750
   1751	if (slab_state == DOWN) {
   1752		/* Creation of first cache (kmem_cache). */
   1753		set_up_node(kmem_cache, CACHE_CACHE);
   1754	} else if (slab_state == PARTIAL) {
   1755		/* For kmem_cache_node */
   1756		set_up_node(cachep, SIZE_NODE);
   1757	} else {
   1758		int node;
   1759
   1760		for_each_online_node(node) {
   1761			cachep->node[node] = kmalloc_node(
   1762				sizeof(struct kmem_cache_node), gfp, node);
   1763			BUG_ON(!cachep->node[node]);
   1764			kmem_cache_node_init(cachep->node[node]);
   1765		}
   1766	}
   1767
   1768	cachep->node[numa_mem_id()]->next_reap =
   1769			jiffies + REAPTIMEOUT_NODE +
   1770			((unsigned long)cachep) % REAPTIMEOUT_NODE;
   1771
   1772	cpu_cache_get(cachep)->avail = 0;
   1773	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
   1774	cpu_cache_get(cachep)->batchcount = 1;
   1775	cpu_cache_get(cachep)->touched = 0;
   1776	cachep->batchcount = 1;
   1777	cachep->limit = BOOT_CPUCACHE_ENTRIES;
   1778	return 0;
   1779}
   1780
   1781slab_flags_t kmem_cache_flags(unsigned int object_size,
   1782	slab_flags_t flags, const char *name)
   1783{
   1784	return flags;
   1785}
   1786
   1787struct kmem_cache *
   1788__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
   1789		   slab_flags_t flags, void (*ctor)(void *))
   1790{
   1791	struct kmem_cache *cachep;
   1792
   1793	cachep = find_mergeable(size, align, flags, name, ctor);
   1794	if (cachep) {
   1795		cachep->refcount++;
   1796
   1797		/*
   1798		 * Adjust the object sizes so that we clear
   1799		 * the complete object on kzalloc.
   1800		 */
   1801		cachep->object_size = max_t(int, cachep->object_size, size);
   1802	}
   1803	return cachep;
   1804}
   1805
   1806static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
   1807			size_t size, slab_flags_t flags)
   1808{
   1809	size_t left;
   1810
   1811	cachep->num = 0;
   1812
   1813	/*
   1814	 * If slab auto-initialization on free is enabled, store the freelist
   1815	 * off-slab, so that its contents don't end up in one of the allocated
   1816	 * objects.
   1817	 */
   1818	if (unlikely(slab_want_init_on_free(cachep)))
   1819		return false;
   1820
   1821	if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
   1822		return false;
   1823
   1824	left = calculate_slab_order(cachep, size,
   1825			flags | CFLGS_OBJFREELIST_SLAB);
   1826	if (!cachep->num)
   1827		return false;
   1828
   1829	if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
   1830		return false;
   1831
   1832	cachep->colour = left / cachep->colour_off;
   1833
   1834	return true;
   1835}
   1836
   1837static bool set_off_slab_cache(struct kmem_cache *cachep,
   1838			size_t size, slab_flags_t flags)
   1839{
   1840	size_t left;
   1841
   1842	cachep->num = 0;
   1843
   1844	/*
    1845	 * Always use on-slab management when SLAB_NOLEAKTRACE is set,
   1846	 * to avoid recursive calls into kmemleak.
   1847	 */
   1848	if (flags & SLAB_NOLEAKTRACE)
   1849		return false;
   1850
   1851	/*
   1852	 * Size is large, assume best to place the slab management obj
   1853	 * off-slab (should allow better packing of objs).
   1854	 */
   1855	left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
   1856	if (!cachep->num)
   1857		return false;
   1858
   1859	/*
   1860	 * If the slab has been placed off-slab, and we have enough space then
   1861	 * move it on-slab. This is at the expense of any extra colouring.
   1862	 */
   1863	if (left >= cachep->num * sizeof(freelist_idx_t))
   1864		return false;
   1865
   1866	cachep->colour = left / cachep->colour_off;
   1867
   1868	return true;
   1869}
   1870
   1871static bool set_on_slab_cache(struct kmem_cache *cachep,
   1872			size_t size, slab_flags_t flags)
   1873{
   1874	size_t left;
   1875
   1876	cachep->num = 0;
   1877
   1878	left = calculate_slab_order(cachep, size, flags);
   1879	if (!cachep->num)
   1880		return false;
   1881
   1882	cachep->colour = left / cachep->colour_off;
   1883
   1884	return true;
   1885}
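
        /*
         * Colouring sketch with hypothetical numbers: left_over = 192 and
         * colour_off = 64 (one cache line) give cachep->colour = 192 / 64 = 3,
         * so cache_grow_begin() starts the object area of successive slabs at
         * offsets 0, 64 and 128 before wrapping, spreading identical objects
         * across different cache lines.
         */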
   1886
   1887/**
   1888 * __kmem_cache_create - Create a cache.
   1889 * @cachep: cache management descriptor
   1890 * @flags: SLAB flags
   1891 *
    1892 * Returns zero on success, nonzero on failure.
    1893 * Cannot be called from interrupt context, but can be interrupted.
    1894 * The cache's constructor is run as new slab pages are allocated.
   1895 *
   1896 * The flags are
   1897 *
   1898 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
   1899 * to catch references to uninitialised memory.
   1900 *
   1901 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
   1902 * for buffer overruns.
   1903 *
   1904 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
   1905 * cacheline.  This can be beneficial if you're counting cycles as closely
   1906 * as davem.
   1907 *
    1908 * Return: 0 on success, nonzero on failure
   1909 */
   1910int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
   1911{
   1912	size_t ralign = BYTES_PER_WORD;
   1913	gfp_t gfp;
   1914	int err;
   1915	unsigned int size = cachep->size;
   1916
   1917#if DEBUG
   1918#if FORCED_DEBUG
   1919	/*
   1920	 * Enable redzoning and last user accounting, except for caches with
   1921	 * large objects, if the increased size would increase the object size
   1922	 * above the next power of two: caches with object sizes just above a
   1923	 * power of two have a significant amount of internal fragmentation.
   1924	 */
   1925	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
   1926						2 * sizeof(unsigned long long)))
   1927		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
   1928	if (!(flags & SLAB_TYPESAFE_BY_RCU))
   1929		flags |= SLAB_POISON;
   1930#endif
   1931#endif
   1932
   1933	/*
   1934	 * Check that size is in terms of words.  This is needed to avoid
   1935	 * unaligned accesses for some archs when redzoning is used, and makes
   1936	 * sure any on-slab bufctl's are also correctly aligned.
   1937	 */
   1938	size = ALIGN(size, BYTES_PER_WORD);
   1939
   1940	if (flags & SLAB_RED_ZONE) {
   1941		ralign = REDZONE_ALIGN;
   1942		/* If redzoning, ensure that the second redzone is suitably
   1943		 * aligned, by adjusting the object size accordingly. */
   1944		size = ALIGN(size, REDZONE_ALIGN);
   1945	}
   1946
   1947	/* 3) caller mandated alignment */
   1948	if (ralign < cachep->align) {
   1949		ralign = cachep->align;
   1950	}
   1951	/* disable debug if necessary */
   1952	if (ralign > __alignof__(unsigned long long))
   1953		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
   1954	/*
   1955	 * 4) Store it.
   1956	 */
   1957	cachep->align = ralign;
   1958	cachep->colour_off = cache_line_size();
   1959	/* Offset must be a multiple of the alignment. */
   1960	if (cachep->colour_off < cachep->align)
   1961		cachep->colour_off = cachep->align;
   1962
   1963	if (slab_is_available())
   1964		gfp = GFP_KERNEL;
   1965	else
   1966		gfp = GFP_NOWAIT;
   1967
   1968#if DEBUG
   1969
   1970	/*
   1971	 * Both debugging options require word-alignment which is calculated
   1972	 * into align above.
   1973	 */
   1974	if (flags & SLAB_RED_ZONE) {
   1975		/* add space for red zone words */
   1976		cachep->obj_offset += sizeof(unsigned long long);
   1977		size += 2 * sizeof(unsigned long long);
   1978	}
   1979	if (flags & SLAB_STORE_USER) {
   1980		/* user store requires one word storage behind the end of
   1981		 * the real object. But if the second red zone needs to be
   1982		 * aligned to 64 bits, we must allow that much space.
   1983		 */
   1984		if (flags & SLAB_RED_ZONE)
   1985			size += REDZONE_ALIGN;
   1986		else
   1987			size += BYTES_PER_WORD;
   1988	}
   1989#endif
   1990
   1991	kasan_cache_create(cachep, &size, &flags);
   1992
   1993	size = ALIGN(size, cachep->align);
    1994	/*
    1995	 * To keep the freelist index byte-sized we must restrict the number of
    1996	 * objects in a slab; see the comment on the SLAB_OBJ_MIN_SIZE definition.
    1997	 */
   1998	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
   1999		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
   2000
   2001#if DEBUG
    2002	/*
    2003	 * To activate debug pagealloc, off-slab freelist management is
    2004	 * required. In the early phase of initialization the small caches
    2005	 * that would hold such a freelist are not set up yet, so we also
    2006	 * require size >= 256: this guarantees that every small cache needed
    2007	 * here has already been created by the slab initialization sequence.
    2008	 */
   2009	if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
   2010		size >= 256 && cachep->object_size > cache_line_size()) {
   2011		if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
   2012			size_t tmp_size = ALIGN(size, PAGE_SIZE);
   2013
   2014			if (set_off_slab_cache(cachep, tmp_size, flags)) {
   2015				flags |= CFLGS_OFF_SLAB;
   2016				cachep->obj_offset += tmp_size - size;
   2017				size = tmp_size;
   2018				goto done;
   2019			}
   2020		}
   2021	}
   2022#endif
   2023
   2024	if (set_objfreelist_slab_cache(cachep, size, flags)) {
   2025		flags |= CFLGS_OBJFREELIST_SLAB;
   2026		goto done;
   2027	}
   2028
   2029	if (set_off_slab_cache(cachep, size, flags)) {
   2030		flags |= CFLGS_OFF_SLAB;
   2031		goto done;
   2032	}
   2033
   2034	if (set_on_slab_cache(cachep, size, flags))
   2035		goto done;
   2036
   2037	return -E2BIG;
   2038
   2039done:
   2040	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
   2041	cachep->flags = flags;
   2042	cachep->allocflags = __GFP_COMP;
   2043	if (flags & SLAB_CACHE_DMA)
   2044		cachep->allocflags |= GFP_DMA;
   2045	if (flags & SLAB_CACHE_DMA32)
   2046		cachep->allocflags |= GFP_DMA32;
   2047	if (flags & SLAB_RECLAIM_ACCOUNT)
   2048		cachep->allocflags |= __GFP_RECLAIMABLE;
   2049	cachep->size = size;
   2050	cachep->reciprocal_buffer_size = reciprocal_value(size);
   2051
   2052#if DEBUG
   2053	/*
   2054	 * If we're going to use the generic kernel_map_pages()
   2055	 * poisoning, then it's going to smash the contents of
   2056	 * the redzone and userword anyhow, so switch them off.
   2057	 */
   2058	if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
   2059		(cachep->flags & SLAB_POISON) &&
   2060		is_debug_pagealloc_cache(cachep))
   2061		cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
   2062#endif
   2063
   2064	if (OFF_SLAB(cachep)) {
   2065		cachep->freelist_cache =
   2066			kmalloc_slab(cachep->freelist_size, 0u);
   2067	}
   2068
   2069	err = setup_cpu_cache(cachep, gfp);
   2070	if (err) {
   2071		__kmem_cache_release(cachep);
   2072		return err;
   2073	}
   2074
   2075	return 0;
   2076}
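
        /*
         * __kmem_cache_create() is normally reached through the public
         * kmem_cache_create() wrapper in mm/slab_common.c. A minimal usage
         * sketch from a hypothetical client (the "foo" names are
         * illustrative only):
         *
         *	struct foo { int a; struct list_head list; };
         *	static struct kmem_cache *foo_cache;
         *
         *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
         *				      SLAB_HWCACHE_ALIGN, NULL);
         *	if (!foo_cache)
         *		return -ENOMEM;
         *
         *	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
         *	...
         *	kmem_cache_free(foo_cache, f);
         *	kmem_cache_destroy(foo_cache);
         */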
   2077
   2078#if DEBUG
   2079static void check_irq_off(void)
   2080{
   2081	BUG_ON(!irqs_disabled());
   2082}
   2083
   2084static void check_irq_on(void)
   2085{
   2086	BUG_ON(irqs_disabled());
   2087}
   2088
   2089static void check_mutex_acquired(void)
   2090{
   2091	BUG_ON(!mutex_is_locked(&slab_mutex));
   2092}
   2093
   2094static void check_spinlock_acquired(struct kmem_cache *cachep)
   2095{
   2096#ifdef CONFIG_SMP
   2097	check_irq_off();
   2098	assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
   2099#endif
   2100}
   2101
   2102static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
   2103{
   2104#ifdef CONFIG_SMP
   2105	check_irq_off();
   2106	assert_spin_locked(&get_node(cachep, node)->list_lock);
   2107#endif
   2108}
   2109
   2110#else
   2111#define check_irq_off()	do { } while(0)
   2112#define check_irq_on()	do { } while(0)
   2113#define check_mutex_acquired()	do { } while(0)
   2114#define check_spinlock_acquired(x) do { } while(0)
   2115#define check_spinlock_acquired_node(x, y) do { } while(0)
   2116#endif
   2117
   2118static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
   2119				int node, bool free_all, struct list_head *list)
   2120{
   2121	int tofree;
   2122
   2123	if (!ac || !ac->avail)
   2124		return;
   2125
   2126	tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
   2127	if (tofree > ac->avail)
   2128		tofree = (ac->avail + 1) / 2;
   2129
   2130	free_block(cachep, ac->entry, tofree, node, list);
   2131	ac->avail -= tofree;
   2132	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
   2133}
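
        /*
         * Worked example of the partial-drain arithmetic above (hypothetical
         * numbers): with limit = 120 and avail = 30, tofree = (120 + 4) / 5
         * = 24 objects, roughly a fifth of the limit. With only avail = 10,
         * 24 > 10, so tofree is clamped to (10 + 1) / 2 = 5. The oldest
         * entries are freed and the survivors are shifted to the front of
         * the array.
         */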
   2134
   2135static void do_drain(void *arg)
   2136{
   2137	struct kmem_cache *cachep = arg;
   2138	struct array_cache *ac;
   2139	int node = numa_mem_id();
   2140	struct kmem_cache_node *n;
   2141	LIST_HEAD(list);
   2142
   2143	check_irq_off();
   2144	ac = cpu_cache_get(cachep);
   2145	n = get_node(cachep, node);
   2146	spin_lock(&n->list_lock);
   2147	free_block(cachep, ac->entry, ac->avail, node, &list);
   2148	spin_unlock(&n->list_lock);
   2149	ac->avail = 0;
   2150	slabs_destroy(cachep, &list);
   2151}
   2152
   2153static void drain_cpu_caches(struct kmem_cache *cachep)
   2154{
   2155	struct kmem_cache_node *n;
   2156	int node;
   2157	LIST_HEAD(list);
   2158
   2159	on_each_cpu(do_drain, cachep, 1);
   2160	check_irq_on();
   2161	for_each_kmem_cache_node(cachep, node, n)
   2162		if (n->alien)
   2163			drain_alien_cache(cachep, n->alien);
   2164
   2165	for_each_kmem_cache_node(cachep, node, n) {
   2166		spin_lock_irq(&n->list_lock);
   2167		drain_array_locked(cachep, n->shared, node, true, &list);
   2168		spin_unlock_irq(&n->list_lock);
   2169
   2170		slabs_destroy(cachep, &list);
   2171	}
   2172}
   2173
   2174/*
   2175 * Remove slabs from the list of free slabs.
   2176 * Specify the number of slabs to drain in tofree.
   2177 *
   2178 * Returns the actual number of slabs released.
   2179 */
   2180static int drain_freelist(struct kmem_cache *cache,
   2181			struct kmem_cache_node *n, int tofree)
   2182{
   2183	struct list_head *p;
   2184	int nr_freed;
   2185	struct slab *slab;
   2186
   2187	nr_freed = 0;
   2188	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
   2189
   2190		spin_lock_irq(&n->list_lock);
   2191		p = n->slabs_free.prev;
   2192		if (p == &n->slabs_free) {
   2193			spin_unlock_irq(&n->list_lock);
   2194			goto out;
   2195		}
   2196
   2197		slab = list_entry(p, struct slab, slab_list);
   2198		list_del(&slab->slab_list);
   2199		n->free_slabs--;
   2200		n->total_slabs--;
   2201		/*
   2202		 * Safe to drop the lock. The slab is no longer linked
   2203		 * to the cache.
   2204		 */
   2205		n->free_objects -= cache->num;
   2206		spin_unlock_irq(&n->list_lock);
   2207		slab_destroy(cache, slab);
   2208		nr_freed++;
   2209	}
   2210out:
   2211	return nr_freed;
   2212}
   2213
   2214bool __kmem_cache_empty(struct kmem_cache *s)
   2215{
   2216	int node;
   2217	struct kmem_cache_node *n;
   2218
   2219	for_each_kmem_cache_node(s, node, n)
   2220		if (!list_empty(&n->slabs_full) ||
   2221		    !list_empty(&n->slabs_partial))
   2222			return false;
   2223	return true;
   2224}
   2225
   2226int __kmem_cache_shrink(struct kmem_cache *cachep)
   2227{
   2228	int ret = 0;
   2229	int node;
   2230	struct kmem_cache_node *n;
   2231
   2232	drain_cpu_caches(cachep);
   2233
   2234	check_irq_on();
   2235	for_each_kmem_cache_node(cachep, node, n) {
   2236		drain_freelist(cachep, n, INT_MAX);
   2237
   2238		ret += !list_empty(&n->slabs_full) ||
   2239			!list_empty(&n->slabs_partial);
   2240	}
   2241	return (ret ? 1 : 0);
   2242}
   2243
   2244int __kmem_cache_shutdown(struct kmem_cache *cachep)
   2245{
   2246	return __kmem_cache_shrink(cachep);
   2247}
   2248
   2249void __kmem_cache_release(struct kmem_cache *cachep)
   2250{
   2251	int i;
   2252	struct kmem_cache_node *n;
   2253
   2254	cache_random_seq_destroy(cachep);
   2255
   2256	free_percpu(cachep->cpu_cache);
   2257
   2258	/* NUMA: free the node structures */
   2259	for_each_kmem_cache_node(cachep, i, n) {
   2260		kfree(n->shared);
   2261		free_alien_cache(n->alien);
   2262		kfree(n);
   2263		cachep->node[i] = NULL;
   2264	}
   2265}
   2266
   2267/*
   2268 * Get the memory for a slab management obj.
   2269 *
    2270 * When a cache keeps its slab descriptor off-slab, that descriptor
    2271 * cannot come from the cache currently being created: that would defer
    2272 * creation of the kmalloc_{dma,}_cache of size sizeof(slab descriptor)
    2273 * to this point, and we would eventually call down to
    2274 * __kmem_cache_create(), which in turn looks up the desired size in
    2275 * the kmalloc_{dma,}_caches.
    2276 * This is a "chicken-and-egg" problem.
   2277 *
   2278 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
   2279 * which are all initialized during kmem_cache_init().
   2280 */
   2281static void *alloc_slabmgmt(struct kmem_cache *cachep,
   2282				   struct slab *slab, int colour_off,
   2283				   gfp_t local_flags, int nodeid)
   2284{
   2285	void *freelist;
   2286	void *addr = slab_address(slab);
   2287
   2288	slab->s_mem = addr + colour_off;
   2289	slab->active = 0;
   2290
   2291	if (OBJFREELIST_SLAB(cachep))
   2292		freelist = NULL;
   2293	else if (OFF_SLAB(cachep)) {
   2294		/* Slab management obj is off-slab. */
   2295		freelist = kmem_cache_alloc_node(cachep->freelist_cache,
   2296					      local_flags, nodeid);
   2297	} else {
    2298		/* We will use the last bytes of the slab for the freelist */
   2299		freelist = addr + (PAGE_SIZE << cachep->gfporder) -
   2300				cachep->freelist_size;
   2301	}
   2302
   2303	return freelist;
   2304}
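
        /*
         * Placement sketch for the on-slab case above, with hypothetical
         * numbers and a one-byte freelist_idx_t: for gfporder = 0 (a
         * 4096-byte slab) and num = 28, freelist_size = 28, so the freelist
         * occupies the last 28 bytes of the slab (offsets 4068..4095) while
         * s_mem points at the colour-offset start of the object area.
         */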
   2305
   2306static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx)
   2307{
   2308	return ((freelist_idx_t *) slab->freelist)[idx];
   2309}
   2310
   2311static inline void set_free_obj(struct slab *slab,
   2312					unsigned int idx, freelist_idx_t val)
   2313{
   2314	((freelist_idx_t *)(slab->freelist))[idx] = val;
   2315}
   2316
   2317static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
   2318{
   2319#if DEBUG
   2320	int i;
   2321
   2322	for (i = 0; i < cachep->num; i++) {
   2323		void *objp = index_to_obj(cachep, slab, i);
   2324
   2325		if (cachep->flags & SLAB_STORE_USER)
   2326			*dbg_userword(cachep, objp) = NULL;
   2327
   2328		if (cachep->flags & SLAB_RED_ZONE) {
   2329			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
   2330			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
   2331		}
   2332		/*
   2333		 * Constructors are not allowed to allocate memory from the same
   2334		 * cache which they are a constructor for.  Otherwise, deadlock.
   2335		 * They must also be threaded.
   2336		 */
   2337		if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
   2338			kasan_unpoison_object_data(cachep,
   2339						   objp + obj_offset(cachep));
   2340			cachep->ctor(objp + obj_offset(cachep));
   2341			kasan_poison_object_data(
   2342				cachep, objp + obj_offset(cachep));
   2343		}
   2344
   2345		if (cachep->flags & SLAB_RED_ZONE) {
   2346			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
   2347				slab_error(cachep, "constructor overwrote the end of an object");
   2348			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
   2349				slab_error(cachep, "constructor overwrote the start of an object");
   2350		}
   2351		/* need to poison the objs? */
   2352		if (cachep->flags & SLAB_POISON) {
   2353			poison_obj(cachep, objp, POISON_FREE);
   2354			slab_kernel_map(cachep, objp, 0);
   2355		}
   2356	}
   2357#endif
   2358}
   2359
   2360#ifdef CONFIG_SLAB_FREELIST_RANDOM
   2361/* Hold information during a freelist initialization */
   2362union freelist_init_state {
   2363	struct {
   2364		unsigned int pos;
   2365		unsigned int *list;
   2366		unsigned int count;
   2367	};
   2368	struct rnd_state rnd_state;
   2369};
   2370
   2371/*
   2372 * Initialize the state based on the randomization method available.
   2373 * return true if the pre-computed list is available, false otherwise.
   2374 */
   2375static bool freelist_state_initialize(union freelist_init_state *state,
   2376				struct kmem_cache *cachep,
   2377				unsigned int count)
   2378{
   2379	bool ret;
   2380	unsigned int rand;
   2381
   2382	/* Use best entropy available to define a random shift */
   2383	rand = get_random_int();
   2384
   2385	/* Use a random state if the pre-computed list is not available */
   2386	if (!cachep->random_seq) {
   2387		prandom_seed_state(&state->rnd_state, rand);
   2388		ret = false;
   2389	} else {
   2390		state->list = cachep->random_seq;
   2391		state->count = count;
   2392		state->pos = rand % count;
   2393		ret = true;
   2394	}
   2395	return ret;
   2396}
   2397
   2398/* Get the next entry on the list and randomize it using a random shift */
   2399static freelist_idx_t next_random_slot(union freelist_init_state *state)
   2400{
   2401	if (state->pos >= state->count)
   2402		state->pos = 0;
   2403	return state->list[state->pos++];
   2404}
   2405
   2406/* Swap two freelist entries */
   2407static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b)
   2408{
   2409	swap(((freelist_idx_t *) slab->freelist)[a],
   2410		((freelist_idx_t *) slab->freelist)[b]);
   2411}
   2412
   2413/*
   2414 * Shuffle the freelist initialization state based on pre-computed lists.
   2415 * return true if the list was successfully shuffled, false otherwise.
   2416 */
   2417static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab)
   2418{
   2419	unsigned int objfreelist = 0, i, rand, count = cachep->num;
   2420	union freelist_init_state state;
   2421	bool precomputed;
   2422
   2423	if (count < 2)
   2424		return false;
   2425
   2426	precomputed = freelist_state_initialize(&state, cachep, count);
   2427
   2428	/* Take a random entry as the objfreelist */
   2429	if (OBJFREELIST_SLAB(cachep)) {
   2430		if (!precomputed)
   2431			objfreelist = count - 1;
   2432		else
   2433			objfreelist = next_random_slot(&state);
   2434		slab->freelist = index_to_obj(cachep, slab, objfreelist) +
   2435						obj_offset(cachep);
   2436		count--;
   2437	}
   2438
   2439	/*
   2440	 * On early boot, generate the list dynamically.
   2441	 * Later use a pre-computed list for speed.
   2442	 */
   2443	if (!precomputed) {
   2444		for (i = 0; i < count; i++)
   2445			set_free_obj(slab, i, i);
   2446
   2447		/* Fisher-Yates shuffle */
   2448		for (i = count - 1; i > 0; i--) {
   2449			rand = prandom_u32_state(&state.rnd_state);
   2450			rand %= (i + 1);
   2451			swap_free_obj(slab, i, rand);
   2452		}
   2453	} else {
   2454		for (i = 0; i < count; i++)
   2455			set_free_obj(slab, i, next_random_slot(&state));
   2456	}
   2457
   2458	if (OBJFREELIST_SLAB(cachep))
   2459		set_free_obj(slab, cachep->num - 1, objfreelist);
   2460
   2461	return true;
   2462}
   2463#else
   2464static inline bool shuffle_freelist(struct kmem_cache *cachep,
   2465				struct slab *slab)
   2466{
   2467	return false;
   2468}
   2469#endif /* CONFIG_SLAB_FREELIST_RANDOM */
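
        /*
         * Sketch of the pre-computed path above, with hypothetical numbers:
         * for cachep->random_seq = [2, 0, 3, 1], count = 4 and a random
         * start pos = 1, successive next_random_slot() calls yield
         * 0, 3, 1, 2, 0, ... - the same per-cache permutation walked
         * circularly from a per-slab random offset, so each new slab hands
         * out its objects in a different order.
         */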
   2470
   2471static void cache_init_objs(struct kmem_cache *cachep,
   2472			    struct slab *slab)
   2473{
   2474	int i;
   2475	void *objp;
   2476	bool shuffled;
   2477
   2478	cache_init_objs_debug(cachep, slab);
   2479
   2480	/* Try to randomize the freelist if enabled */
   2481	shuffled = shuffle_freelist(cachep, slab);
   2482
   2483	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
   2484		slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) +
   2485						obj_offset(cachep);
   2486	}
   2487
   2488	for (i = 0; i < cachep->num; i++) {
   2489		objp = index_to_obj(cachep, slab, i);
   2490		objp = kasan_init_slab_obj(cachep, objp);
   2491
   2492		/* constructor could break poison info */
   2493		if (DEBUG == 0 && cachep->ctor) {
   2494			kasan_unpoison_object_data(cachep, objp);
   2495			cachep->ctor(objp);
   2496			kasan_poison_object_data(cachep, objp);
   2497		}
   2498
   2499		if (!shuffled)
   2500			set_free_obj(slab, i, i);
   2501	}
   2502}
   2503
   2504static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab)
   2505{
   2506	void *objp;
   2507
   2508	objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active));
   2509	slab->active++;
   2510
   2511	return objp;
   2512}
   2513
   2514static void slab_put_obj(struct kmem_cache *cachep,
   2515			struct slab *slab, void *objp)
   2516{
   2517	unsigned int objnr = obj_to_index(cachep, slab, objp);
   2518#if DEBUG
   2519	unsigned int i;
   2520
   2521	/* Verify double free bug */
   2522	for (i = slab->active; i < cachep->num; i++) {
   2523		if (get_free_obj(slab, i) == objnr) {
   2524			pr_err("slab: double free detected in cache '%s', objp %px\n",
   2525			       cachep->name, objp);
   2526			BUG();
   2527		}
   2528	}
   2529#endif
   2530	slab->active--;
   2531	if (!slab->freelist)
   2532		slab->freelist = objp + obj_offset(cachep);
   2533
   2534	set_free_obj(slab, slab->active, objnr);
   2535}
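
        /*
         * Worked example of the index-based freelist for a 4-object slab:
         * after cache_init_objs() the freelist holds [0, 1, 2, 3] with
         * active = 0. Two slab_get_obj() calls hand out objects 0 and 1 and
         * leave active = 2. Freeing object 0 via slab_put_obj() decrements
         * active to 1 and stores its index at slot 1, giving [0, 0, 2, 3],
         * so the next slab_get_obj() returns object 0 again: the freelist
         * behaves as a LIFO stack.
         */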
   2536
   2537/*
   2538 * Grow (by 1) the number of slabs within a cache.  This is called by
   2539 * kmem_cache_alloc() when there are no active objs left in a cache.
   2540 */
   2541static struct slab *cache_grow_begin(struct kmem_cache *cachep,
   2542				gfp_t flags, int nodeid)
   2543{
   2544	void *freelist;
   2545	size_t offset;
   2546	gfp_t local_flags;
   2547	int slab_node;
   2548	struct kmem_cache_node *n;
   2549	struct slab *slab;
   2550
   2551	/*
    2552	 * Be lazy and only check for valid flags here, keeping it out of the
   2553	 * critical path in kmem_cache_alloc().
   2554	 */
   2555	if (unlikely(flags & GFP_SLAB_BUG_MASK))
   2556		flags = kmalloc_fix_flags(flags);
   2557
   2558	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
   2559	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
   2560
   2561	check_irq_off();
   2562	if (gfpflags_allow_blocking(local_flags))
   2563		local_irq_enable();
   2564
   2565	/*
   2566	 * Get mem for the objs.  Attempt to allocate a physical page from
   2567	 * 'nodeid'.
   2568	 */
   2569	slab = kmem_getpages(cachep, local_flags, nodeid);
   2570	if (!slab)
   2571		goto failed;
   2572
   2573	slab_node = slab_nid(slab);
   2574	n = get_node(cachep, slab_node);
   2575
    2576	/* Get colour for the slab, and calculate the next value. */
   2577	n->colour_next++;
   2578	if (n->colour_next >= cachep->colour)
   2579		n->colour_next = 0;
   2580
   2581	offset = n->colour_next;
   2582	if (offset >= cachep->colour)
   2583		offset = 0;
   2584
   2585	offset *= cachep->colour_off;
   2586
   2587	/*
   2588	 * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
   2589	 * page_address() in the latter returns a non-tagged pointer,
   2590	 * as it should be for slab pages.
   2591	 */
   2592	kasan_poison_slab(slab);
   2593
   2594	/* Get slab management. */
   2595	freelist = alloc_slabmgmt(cachep, slab, offset,
   2596			local_flags & ~GFP_CONSTRAINT_MASK, slab_node);
   2597	if (OFF_SLAB(cachep) && !freelist)
   2598		goto opps1;
   2599
   2600	slab->slab_cache = cachep;
   2601	slab->freelist = freelist;
   2602
   2603	cache_init_objs(cachep, slab);
   2604
   2605	if (gfpflags_allow_blocking(local_flags))
   2606		local_irq_disable();
   2607
   2608	return slab;
   2609
   2610opps1:
   2611	kmem_freepages(cachep, slab);
   2612failed:
   2613	if (gfpflags_allow_blocking(local_flags))
   2614		local_irq_disable();
   2615	return NULL;
   2616}
   2617
   2618static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
   2619{
   2620	struct kmem_cache_node *n;
   2621	void *list = NULL;
   2622
   2623	check_irq_off();
   2624
   2625	if (!slab)
   2626		return;
   2627
   2628	INIT_LIST_HEAD(&slab->slab_list);
   2629	n = get_node(cachep, slab_nid(slab));
   2630
   2631	spin_lock(&n->list_lock);
   2632	n->total_slabs++;
   2633	if (!slab->active) {
   2634		list_add_tail(&slab->slab_list, &n->slabs_free);
   2635		n->free_slabs++;
   2636	} else
   2637		fixup_slab_list(cachep, n, slab, &list);
   2638
   2639	STATS_INC_GROWN(cachep);
   2640	n->free_objects += cachep->num - slab->active;
   2641	spin_unlock(&n->list_lock);
   2642
   2643	fixup_objfreelist_debug(cachep, &list);
   2644}
   2645
   2646#if DEBUG
   2647
   2648/*
   2649 * Perform extra freeing checks:
   2650 * - detect bad pointers.
   2651 * - POISON/RED_ZONE checking
   2652 */
   2653static void kfree_debugcheck(const void *objp)
   2654{
   2655	if (!virt_addr_valid(objp)) {
   2656		pr_err("kfree_debugcheck: out of range ptr %lxh\n",
   2657		       (unsigned long)objp);
   2658		BUG();
   2659	}
   2660}
   2661
   2662static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
   2663{
   2664	unsigned long long redzone1, redzone2;
   2665
   2666	redzone1 = *dbg_redzone1(cache, obj);
   2667	redzone2 = *dbg_redzone2(cache, obj);
   2668
   2669	/*
   2670	 * Redzone is ok.
   2671	 */
   2672	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
   2673		return;
   2674
   2675	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
   2676		slab_error(cache, "double free detected");
   2677	else
   2678		slab_error(cache, "memory outside object was overwritten");
   2679
   2680	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
   2681	       obj, redzone1, redzone2);
   2682}
   2683
   2684static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
   2685				   unsigned long caller)
   2686{
   2687	unsigned int objnr;
   2688	struct slab *slab;
   2689
   2690	BUG_ON(virt_to_cache(objp) != cachep);
   2691
   2692	objp -= obj_offset(cachep);
   2693	kfree_debugcheck(objp);
   2694	slab = virt_to_slab(objp);
   2695
   2696	if (cachep->flags & SLAB_RED_ZONE) {
   2697		verify_redzone_free(cachep, objp);
   2698		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
   2699		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
   2700	}
   2701	if (cachep->flags & SLAB_STORE_USER)
   2702		*dbg_userword(cachep, objp) = (void *)caller;
   2703
   2704	objnr = obj_to_index(cachep, slab, objp);
   2705
   2706	BUG_ON(objnr >= cachep->num);
   2707	BUG_ON(objp != index_to_obj(cachep, slab, objnr));
   2708
   2709	if (cachep->flags & SLAB_POISON) {
   2710		poison_obj(cachep, objp, POISON_FREE);
   2711		slab_kernel_map(cachep, objp, 0);
   2712	}
   2713	return objp;
   2714}
   2715
   2716#else
   2717#define kfree_debugcheck(x) do { } while(0)
   2718#define cache_free_debugcheck(x, objp, z) (objp)
   2719#endif
   2720
   2721static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
   2722						void **list)
   2723{
   2724#if DEBUG
   2725	void *next = *list;
   2726	void *objp;
   2727
   2728	while (next) {
   2729		objp = next - obj_offset(cachep);
   2730		next = *(void **)next;
   2731		poison_obj(cachep, objp, POISON_FREE);
   2732	}
   2733#endif
   2734}
   2735
   2736static inline void fixup_slab_list(struct kmem_cache *cachep,
   2737				struct kmem_cache_node *n, struct slab *slab,
   2738				void **list)
   2739{
   2740	/* move slabp to correct slabp list: */
   2741	list_del(&slab->slab_list);
   2742	if (slab->active == cachep->num) {
   2743		list_add(&slab->slab_list, &n->slabs_full);
   2744		if (OBJFREELIST_SLAB(cachep)) {
   2745#if DEBUG
   2746			/* Poisoning will be done without holding the lock */
   2747			if (cachep->flags & SLAB_POISON) {
   2748				void **objp = slab->freelist;
   2749
   2750				*objp = *list;
   2751				*list = objp;
   2752			}
   2753#endif
   2754			slab->freelist = NULL;
   2755		}
   2756	} else
   2757		list_add(&slab->slab_list, &n->slabs_partial);
   2758}
   2759
   2760/* Try to find non-pfmemalloc slab if needed */
   2761static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n,
   2762					struct slab *slab, bool pfmemalloc)
   2763{
   2764	if (!slab)
   2765		return NULL;
   2766
   2767	if (pfmemalloc)
   2768		return slab;
   2769
   2770	if (!slab_test_pfmemalloc(slab))
   2771		return slab;
   2772
   2773	/* No need to keep pfmemalloc slab if we have enough free objects */
   2774	if (n->free_objects > n->free_limit) {
   2775		slab_clear_pfmemalloc(slab);
   2776		return slab;
   2777	}
   2778
   2779	/* Move pfmemalloc slab to the end of list to speed up next search */
   2780	list_del(&slab->slab_list);
   2781	if (!slab->active) {
   2782		list_add_tail(&slab->slab_list, &n->slabs_free);
   2783		n->free_slabs++;
   2784	} else
   2785		list_add_tail(&slab->slab_list, &n->slabs_partial);
   2786
   2787	list_for_each_entry(slab, &n->slabs_partial, slab_list) {
   2788		if (!slab_test_pfmemalloc(slab))
   2789			return slab;
   2790	}
   2791
   2792	n->free_touched = 1;
   2793	list_for_each_entry(slab, &n->slabs_free, slab_list) {
   2794		if (!slab_test_pfmemalloc(slab)) {
   2795			n->free_slabs--;
   2796			return slab;
   2797		}
   2798	}
   2799
   2800	return NULL;
   2801}
   2802
   2803static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
   2804{
   2805	struct slab *slab;
   2806
   2807	assert_spin_locked(&n->list_lock);
   2808	slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
   2809					slab_list);
   2810	if (!slab) {
   2811		n->free_touched = 1;
   2812		slab = list_first_entry_or_null(&n->slabs_free, struct slab,
   2813						slab_list);
   2814		if (slab)
   2815			n->free_slabs--;
   2816	}
   2817
   2818	if (sk_memalloc_socks())
   2819		slab = get_valid_first_slab(n, slab, pfmemalloc);
   2820
   2821	return slab;
   2822}
   2823
   2824static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
   2825				struct kmem_cache_node *n, gfp_t flags)
   2826{
   2827	struct slab *slab;
   2828	void *obj;
   2829	void *list = NULL;
   2830
   2831	if (!gfp_pfmemalloc_allowed(flags))
   2832		return NULL;
   2833
   2834	spin_lock(&n->list_lock);
   2835	slab = get_first_slab(n, true);
   2836	if (!slab) {
   2837		spin_unlock(&n->list_lock);
   2838		return NULL;
   2839	}
   2840
   2841	obj = slab_get_obj(cachep, slab);
   2842	n->free_objects--;
   2843
   2844	fixup_slab_list(cachep, n, slab, &list);
   2845
   2846	spin_unlock(&n->list_lock);
   2847	fixup_objfreelist_debug(cachep, &list);
   2848
   2849	return obj;
   2850}
   2851
   2852/*
    2853 * The slab list should be fixed up by fixup_slab_list() for an existing
    2854 * slab, or by cache_grow_end() for a new slab.
   2855 */
   2856static __always_inline int alloc_block(struct kmem_cache *cachep,
   2857		struct array_cache *ac, struct slab *slab, int batchcount)
   2858{
   2859	/*
   2860	 * There must be at least one object available for
   2861	 * allocation.
   2862	 */
   2863	BUG_ON(slab->active >= cachep->num);
   2864
   2865	while (slab->active < cachep->num && batchcount--) {
   2866		STATS_INC_ALLOCED(cachep);
   2867		STATS_INC_ACTIVE(cachep);
   2868		STATS_SET_HIGH(cachep);
   2869
   2870		ac->entry[ac->avail++] = slab_get_obj(cachep, slab);
   2871	}
   2872
   2873	return batchcount;
   2874}
   2875
   2876static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
   2877{
   2878	int batchcount;
   2879	struct kmem_cache_node *n;
   2880	struct array_cache *ac, *shared;
   2881	int node;
   2882	void *list = NULL;
   2883	struct slab *slab;
   2884
   2885	check_irq_off();
   2886	node = numa_mem_id();
   2887
   2888	ac = cpu_cache_get(cachep);
   2889	batchcount = ac->batchcount;
   2890	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
   2891		/*
   2892		 * If there was little recent activity on this cache, then
   2893		 * perform only a partial refill.  Otherwise we could generate
   2894		 * refill bouncing.
   2895		 */
   2896		batchcount = BATCHREFILL_LIMIT;
   2897	}
   2898	n = get_node(cachep, node);
   2899
   2900	BUG_ON(ac->avail > 0 || !n);
   2901	shared = READ_ONCE(n->shared);
   2902	if (!n->free_objects && (!shared || !shared->avail))
   2903		goto direct_grow;
   2904
   2905	spin_lock(&n->list_lock);
   2906	shared = READ_ONCE(n->shared);
   2907
   2908	/* See if we can refill from the shared array */
   2909	if (shared && transfer_objects(ac, shared, batchcount)) {
   2910		shared->touched = 1;
   2911		goto alloc_done;
   2912	}
   2913
   2914	while (batchcount > 0) {
    2915		/* Get the slab the allocation is to come from. */
   2916		slab = get_first_slab(n, false);
   2917		if (!slab)
   2918			goto must_grow;
   2919
   2920		check_spinlock_acquired(cachep);
   2921
   2922		batchcount = alloc_block(cachep, ac, slab, batchcount);
   2923		fixup_slab_list(cachep, n, slab, &list);
   2924	}
   2925
   2926must_grow:
   2927	n->free_objects -= ac->avail;
   2928alloc_done:
   2929	spin_unlock(&n->list_lock);
   2930	fixup_objfreelist_debug(cachep, &list);
   2931
   2932direct_grow:
   2933	if (unlikely(!ac->avail)) {
   2934		/* Check if we can use obj in pfmemalloc slab */
   2935		if (sk_memalloc_socks()) {
   2936			void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
   2937
   2938			if (obj)
   2939				return obj;
   2940		}
   2941
   2942		slab = cache_grow_begin(cachep, gfp_exact_node(flags), node);
   2943
   2944		/*
   2945		 * cache_grow_begin() can reenable interrupts,
   2946		 * then ac could change.
   2947		 */
   2948		ac = cpu_cache_get(cachep);
   2949		if (!ac->avail && slab)
   2950			alloc_block(cachep, ac, slab, batchcount);
   2951		cache_grow_end(cachep, slab);
   2952
   2953		if (!ac->avail)
   2954			return NULL;
   2955	}
   2956	ac->touched = 1;
   2957
   2958	return ac->entry[--ac->avail];
   2959}
   2960
   2961static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
   2962						gfp_t flags)
   2963{
   2964	might_sleep_if(gfpflags_allow_blocking(flags));
   2965}
   2966
   2967#if DEBUG
   2968static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
   2969				gfp_t flags, void *objp, unsigned long caller)
   2970{
   2971	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
   2972	if (!objp || is_kfence_address(objp))
   2973		return objp;
   2974	if (cachep->flags & SLAB_POISON) {
   2975		check_poison_obj(cachep, objp);
   2976		slab_kernel_map(cachep, objp, 1);
   2977		poison_obj(cachep, objp, POISON_INUSE);
   2978	}
   2979	if (cachep->flags & SLAB_STORE_USER)
   2980		*dbg_userword(cachep, objp) = (void *)caller;
   2981
   2982	if (cachep->flags & SLAB_RED_ZONE) {
   2983		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
   2984				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
   2985			slab_error(cachep, "double free, or memory outside object was overwritten");
   2986			pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
   2987			       objp, *dbg_redzone1(cachep, objp),
   2988			       *dbg_redzone2(cachep, objp));
   2989		}
   2990		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
   2991		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
   2992	}
   2993
   2994	objp += obj_offset(cachep);
   2995	if (cachep->ctor && cachep->flags & SLAB_POISON)
   2996		cachep->ctor(objp);
   2997	if ((unsigned long)objp & (arch_slab_minalign() - 1)) {
   2998		pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp,
   2999		       arch_slab_minalign());
   3000	}
   3001	return objp;
   3002}
   3003#else
   3004#define cache_alloc_debugcheck_after(a, b, objp, d) (objp)
   3005#endif
   3006
   3007static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
   3008{
   3009	void *objp;
   3010	struct array_cache *ac;
   3011
   3012	check_irq_off();
   3013
   3014	ac = cpu_cache_get(cachep);
   3015	if (likely(ac->avail)) {
   3016		ac->touched = 1;
   3017		objp = ac->entry[--ac->avail];
   3018
   3019		STATS_INC_ALLOCHIT(cachep);
   3020		goto out;
   3021	}
   3022
   3023	STATS_INC_ALLOCMISS(cachep);
   3024	objp = cache_alloc_refill(cachep, flags);
   3025	/*
   3026	 * the 'ac' may be updated by cache_alloc_refill(),
   3027	 * and kmemleak_erase() requires its correct value.
   3028	 */
   3029	ac = cpu_cache_get(cachep);
   3030
   3031out:
   3032	/*
   3033	 * To avoid a false negative, if an object that is in one of the
   3034	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
   3035	 * treat the array pointers as a reference to the object.
   3036	 */
   3037	if (objp)
   3038		kmemleak_erase(&ac->entry[ac->avail]);
   3039	return objp;
   3040}
   3041
   3042#ifdef CONFIG_NUMA
   3043static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
   3044
   3045/*
    3046 * Try allocating on another node if PFA_SPREAD_SLAB or a mempolicy is set.
   3047 *
   3048 * If we are in_interrupt, then process context, including cpusets and
   3049 * mempolicy, may not apply and should not be used for allocation policy.
   3050 */
   3051static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
   3052{
   3053	int nid_alloc, nid_here;
   3054
   3055	if (in_interrupt() || (flags & __GFP_THISNODE))
   3056		return NULL;
   3057	nid_alloc = nid_here = numa_mem_id();
   3058	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
   3059		nid_alloc = cpuset_slab_spread_node();
   3060	else if (current->mempolicy)
   3061		nid_alloc = mempolicy_slab_node();
   3062	if (nid_alloc != nid_here)
   3063		return ____cache_alloc_node(cachep, flags, nid_alloc);
   3064	return NULL;
   3065}
   3066
   3067/*
    3068 * Fallback function if there was no memory available and no objects on a
    3069 * certain node and fallback is permitted. First we scan all the
    3070 * available nodes for available objects. If that fails then we
   3071 * perform an allocation without specifying a node. This allows the page
   3072 * allocator to do its reclaim / fallback magic. We then insert the
   3073 * slab into the proper nodelist and then allocate from it.
   3074 */
   3075static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
   3076{
   3077	struct zonelist *zonelist;
   3078	struct zoneref *z;
   3079	struct zone *zone;
   3080	enum zone_type highest_zoneidx = gfp_zone(flags);
   3081	void *obj = NULL;
   3082	struct slab *slab;
   3083	int nid;
   3084	unsigned int cpuset_mems_cookie;
   3085
   3086	if (flags & __GFP_THISNODE)
   3087		return NULL;
   3088
   3089retry_cpuset:
   3090	cpuset_mems_cookie = read_mems_allowed_begin();
   3091	zonelist = node_zonelist(mempolicy_slab_node(), flags);
   3092
   3093retry:
   3094	/*
   3095	 * Look through allowed nodes for objects available
   3096	 * from existing per node queues.
   3097	 */
   3098	for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
   3099		nid = zone_to_nid(zone);
   3100
   3101		if (cpuset_zone_allowed(zone, flags) &&
   3102			get_node(cache, nid) &&
   3103			get_node(cache, nid)->free_objects) {
   3104				obj = ____cache_alloc_node(cache,
   3105					gfp_exact_node(flags), nid);
   3106				if (obj)
   3107					break;
   3108		}
   3109	}
   3110
   3111	if (!obj) {
   3112		/*
   3113		 * This allocation will be performed within the constraints
   3114		 * of the current cpuset / memory policy requirements.
   3115		 * We may trigger various forms of reclaim on the allowed
   3116		 * set and go into memory reserves if necessary.
   3117		 */
   3118		slab = cache_grow_begin(cache, flags, numa_mem_id());
   3119		cache_grow_end(cache, slab);
   3120		if (slab) {
   3121			nid = slab_nid(slab);
   3122			obj = ____cache_alloc_node(cache,
   3123				gfp_exact_node(flags), nid);
   3124
   3125			/*
   3126			 * Another processor may allocate the objects in
   3127			 * the slab since we are not holding any locks.
   3128			 */
   3129			if (!obj)
   3130				goto retry;
   3131		}
   3132	}
   3133
   3134	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
   3135		goto retry_cpuset;
   3136	return obj;
   3137}
   3138
   3139/*
   3140 * An interface to enable slab creation on nodeid
   3141 */
   3142static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
   3143				int nodeid)
   3144{
   3145	struct slab *slab;
   3146	struct kmem_cache_node *n;
   3147	void *obj = NULL;
   3148	void *list = NULL;
   3149
   3150	VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
   3151	n = get_node(cachep, nodeid);
   3152	BUG_ON(!n);
   3153
   3154	check_irq_off();
   3155	spin_lock(&n->list_lock);
   3156	slab = get_first_slab(n, false);
   3157	if (!slab)
   3158		goto must_grow;
   3159
   3160	check_spinlock_acquired_node(cachep, nodeid);
   3161
   3162	STATS_INC_NODEALLOCS(cachep);
   3163	STATS_INC_ACTIVE(cachep);
   3164	STATS_SET_HIGH(cachep);
   3165
   3166	BUG_ON(slab->active == cachep->num);
   3167
   3168	obj = slab_get_obj(cachep, slab);
   3169	n->free_objects--;
   3170
   3171	fixup_slab_list(cachep, n, slab, &list);
   3172
   3173	spin_unlock(&n->list_lock);
   3174	fixup_objfreelist_debug(cachep, &list);
   3175	return obj;
   3176
   3177must_grow:
   3178	spin_unlock(&n->list_lock);
   3179	slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
   3180	if (slab) {
   3181		/* This slab isn't counted yet so don't update free_objects */
   3182		obj = slab_get_obj(cachep, slab);
   3183	}
   3184	cache_grow_end(cachep, slab);
   3185
   3186	return obj ? obj : fallback_alloc(cachep, flags);
   3187}
   3188
   3189static __always_inline void *
   3190slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
   3191		   unsigned long caller)
   3192{
   3193	unsigned long save_flags;
   3194	void *ptr;
   3195	int slab_node = numa_mem_id();
   3196	struct obj_cgroup *objcg = NULL;
   3197	bool init = false;
   3198
   3199	flags &= gfp_allowed_mask;
   3200	cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags);
   3201	if (unlikely(!cachep))
   3202		return NULL;
   3203
   3204	ptr = kfence_alloc(cachep, orig_size, flags);
   3205	if (unlikely(ptr))
   3206		goto out_hooks;
   3207
   3208	cache_alloc_debugcheck_before(cachep, flags);
   3209	local_irq_save(save_flags);
   3210
   3211	if (nodeid == NUMA_NO_NODE)
   3212		nodeid = slab_node;
   3213
   3214	if (unlikely(!get_node(cachep, nodeid))) {
   3215		/* Node not bootstrapped yet */
   3216		ptr = fallback_alloc(cachep, flags);
   3217		goto out;
   3218	}
   3219
   3220	if (nodeid == slab_node) {
   3221		/*
   3222		 * Use the locally cached objects if possible.
   3223		 * However ____cache_alloc does not allow fallback
   3224		 * to other nodes. It may fail while we still have
   3225		 * objects on other nodes available.
   3226		 */
   3227		ptr = ____cache_alloc(cachep, flags);
   3228		if (ptr)
   3229			goto out;
   3230	}
    3231	/* ____cache_alloc_node can fall back to other nodes */
   3232	ptr = ____cache_alloc_node(cachep, flags, nodeid);
   3233  out:
   3234	local_irq_restore(save_flags);
   3235	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
   3236	init = slab_want_init_on_alloc(flags, cachep);
   3237
   3238out_hooks:
   3239	slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
   3240	return ptr;
   3241}
   3242
   3243static __always_inline void *
   3244__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
   3245{
   3246	void *objp;
   3247
   3248	if (current->mempolicy || cpuset_do_slab_mem_spread()) {
   3249		objp = alternate_node_alloc(cache, flags);
   3250		if (objp)
   3251			goto out;
   3252	}
   3253	objp = ____cache_alloc(cache, flags);
   3254
   3255	/*
   3256	 * We may just have run out of memory on the local node.
   3257	 * ____cache_alloc_node() knows how to locate memory on other nodes
   3258	 */
   3259	if (!objp)
   3260		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
   3261
   3262  out:
   3263	return objp;
   3264}
   3265#else
   3266
   3267static __always_inline void *
   3268__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
   3269{
   3270	return ____cache_alloc(cachep, flags);
   3271}
   3272
   3273#endif /* CONFIG_NUMA */
   3274
   3275static __always_inline void *
   3276slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
   3277	   size_t orig_size, unsigned long caller)
   3278{
   3279	unsigned long save_flags;
   3280	void *objp;
   3281	struct obj_cgroup *objcg = NULL;
   3282	bool init = false;
   3283
   3284	flags &= gfp_allowed_mask;
   3285	cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags);
   3286	if (unlikely(!cachep))
   3287		return NULL;
   3288
   3289	objp = kfence_alloc(cachep, orig_size, flags);
   3290	if (unlikely(objp))
   3291		goto out;
   3292
   3293	cache_alloc_debugcheck_before(cachep, flags);
   3294	local_irq_save(save_flags);
   3295	objp = __do_cache_alloc(cachep, flags);
   3296	local_irq_restore(save_flags);
   3297	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
   3298	prefetchw(objp);
   3299	init = slab_want_init_on_alloc(flags, cachep);
   3300
   3301out:
   3302	slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
   3303	return objp;
   3304}
   3305
   3306/*
    3307 * The caller must hold the appropriate kmem_cache_node's list_lock.
    3308 * @list: list of detached free slabs; the caller must free them.
   3309 */
   3310static void free_block(struct kmem_cache *cachep, void **objpp,
   3311			int nr_objects, int node, struct list_head *list)
   3312{
   3313	int i;
   3314	struct kmem_cache_node *n = get_node(cachep, node);
   3315	struct slab *slab;
   3316
   3317	n->free_objects += nr_objects;
   3318
   3319	for (i = 0; i < nr_objects; i++) {
   3320		void *objp;
   3321		struct slab *slab;
   3322
   3323		objp = objpp[i];
   3324
   3325		slab = virt_to_slab(objp);
   3326		list_del(&slab->slab_list);
   3327		check_spinlock_acquired_node(cachep, node);
   3328		slab_put_obj(cachep, slab, objp);
   3329		STATS_DEC_ACTIVE(cachep);
   3330
   3331		/* fixup slab chains */
   3332		if (slab->active == 0) {
   3333			list_add(&slab->slab_list, &n->slabs_free);
   3334			n->free_slabs++;
   3335		} else {
    3336			/* Unconditionally move a slab to the end of the
    3337			 * partial list on free - this gives the remaining
    3338			 * objects maximum time to be freed, too.
    3339			 */
   3340			list_add_tail(&slab->slab_list, &n->slabs_partial);
   3341		}
   3342	}
   3343
   3344	while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
   3345		n->free_objects -= cachep->num;
   3346
   3347		slab = list_last_entry(&n->slabs_free, struct slab, slab_list);
   3348		list_move(&slab->slab_list, list);
   3349		n->free_slabs--;
   3350		n->total_slabs--;
   3351	}
   3352}
   3353
   3354static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
   3355{
   3356	int batchcount;
   3357	struct kmem_cache_node *n;
   3358	int node = numa_mem_id();
   3359	LIST_HEAD(list);
   3360
   3361	batchcount = ac->batchcount;
   3362
   3363	check_irq_off();
   3364	n = get_node(cachep, node);
   3365	spin_lock(&n->list_lock);
   3366	if (n->shared) {
   3367		struct array_cache *shared_array = n->shared;
   3368		int max = shared_array->limit - shared_array->avail;
   3369		if (max) {
   3370			if (batchcount > max)
   3371				batchcount = max;
   3372			memcpy(&(shared_array->entry[shared_array->avail]),
   3373			       ac->entry, sizeof(void *) * batchcount);
   3374			shared_array->avail += batchcount;
   3375			goto free_done;
   3376		}
   3377	}
   3378
   3379	free_block(cachep, ac->entry, batchcount, node, &list);
   3380free_done:
   3381#if STATS
   3382	{
   3383		int i = 0;
   3384		struct slab *slab;
   3385
   3386		list_for_each_entry(slab, &n->slabs_free, slab_list) {
   3387			BUG_ON(slab->active);
   3388
   3389			i++;
   3390		}
   3391		STATS_SET_FREEABLE(cachep, i);
   3392	}
   3393#endif
   3394	spin_unlock(&n->list_lock);
   3395	ac->avail -= batchcount;
   3396	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
   3397	slabs_destroy(cachep, &list);
   3398}
   3399
   3400/*
   3401 * Release an obj back to its cache. If the obj has a constructed state, it must
    3402 * be in this state _before_ it is released.  Called with interrupts disabled.
   3403 */
   3404static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
   3405					 unsigned long caller)
   3406{
   3407	bool init;
   3408
   3409	if (is_kfence_address(objp)) {
   3410		kmemleak_free_recursive(objp, cachep->flags);
   3411		memcg_slab_free_hook(cachep, &objp, 1);
   3412		__kfence_free(objp);
   3413		return;
   3414	}
   3415
   3416	/*
   3417	 * As memory initialization might be integrated into KASAN,
   3418	 * kasan_slab_free and initialization memset must be
   3419	 * kept together to avoid discrepancies in behavior.
   3420	 */
   3421	init = slab_want_init_on_free(cachep);
   3422	if (init && !kasan_has_integrated_init())
   3423		memset(objp, 0, cachep->object_size);
   3424	/* KASAN might put objp into memory quarantine, delaying its reuse. */
   3425	if (kasan_slab_free(cachep, objp, init))
   3426		return;
   3427
   3428	/* Use KCSAN to help debug racy use-after-free. */
   3429	if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
   3430		__kcsan_check_access(objp, cachep->object_size,
   3431				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
   3432
   3433	___cache_free(cachep, objp, caller);
   3434}
   3435
   3436void ___cache_free(struct kmem_cache *cachep, void *objp,
   3437		unsigned long caller)
   3438{
   3439	struct array_cache *ac = cpu_cache_get(cachep);
   3440
   3441	check_irq_off();
   3442	kmemleak_free_recursive(objp, cachep->flags);
   3443	objp = cache_free_debugcheck(cachep, objp, caller);
   3444	memcg_slab_free_hook(cachep, &objp, 1);
   3445
   3446	/*
    3447	 * Skip calling cache_free_alien() when the platform is not NUMA.
    3448	 * This avoids the cache misses that happen while accessing the slab
    3449	 * (a per-page memory reference) to get the nodeid. Instead use a
    3450	 * global variable to skip the call, which is most likely to already
    3451	 * be present in the cache.
   3452	 */
   3453	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
   3454		return;
   3455
   3456	if (ac->avail < ac->limit) {
   3457		STATS_INC_FREEHIT(cachep);
   3458	} else {
   3459		STATS_INC_FREEMISS(cachep);
   3460		cache_flusharray(cachep, ac);
   3461	}
   3462
   3463	if (sk_memalloc_socks()) {
   3464		struct slab *slab = virt_to_slab(objp);
   3465
   3466		if (unlikely(slab_test_pfmemalloc(slab))) {
   3467			cache_free_pfmemalloc(cachep, slab, objp);
   3468			return;
   3469		}
   3470	}
   3471
   3472	__free_one(ac, objp);
   3473}
   3474
   3475static __always_inline
   3476void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
   3477			     gfp_t flags)
   3478{
   3479	void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
   3480
   3481	trace_kmem_cache_alloc(_RET_IP_, ret,
   3482			       cachep->object_size, cachep->size, flags);
   3483
   3484	return ret;
   3485}
   3486
   3487/**
   3488 * kmem_cache_alloc - Allocate an object
   3489 * @cachep: The cache to allocate from.
   3490 * @flags: See kmalloc().
   3491 *
   3492 * Allocate an object from this cache.  The flags are only relevant
   3493 * if the cache has no available objects.
   3494 *
   3495 * Return: pointer to the new object or %NULL in case of error
   3496 */
   3497void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
   3498{
   3499	return __kmem_cache_alloc_lru(cachep, NULL, flags);
   3500}
   3501EXPORT_SYMBOL(kmem_cache_alloc);
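/*
 * Illustrative usage sketch (not part of the original source; the struct,
 * cache and function names are hypothetical): creating a cache with
 * KMEM_CACHE() and a typical kmem_cache_alloc()/kmem_cache_free() pairing.
 */
struct slab_alloc_example {
	int a;
	int b;
};

static inline int slab_alloc_example_use(void)
{
	struct kmem_cache *example_cache;
	struct slab_alloc_example *obj;
	int ret = -ENOMEM;

	/* Name, object size and alignment are derived from the struct. */
	example_cache = KMEM_CACHE(slab_alloc_example, 0);
	if (!example_cache)
		return -ENOMEM;

	/* GFP_KERNEL may sleep; NULL is returned if no object is available. */
	obj = kmem_cache_alloc(example_cache, GFP_KERNEL);
	if (obj) {
		obj->a = 1;
		obj->b = 2;
		/* Objects must be freed back to the cache they came from. */
		kmem_cache_free(example_cache, obj);
		ret = 0;
	}

	kmem_cache_destroy(example_cache);
	return ret;
}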
   3502
   3503void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
   3504			   gfp_t flags)
   3505{
   3506	return __kmem_cache_alloc_lru(cachep, lru, flags);
   3507}
   3508EXPORT_SYMBOL(kmem_cache_alloc_lru);
   3509
   3510static __always_inline void
   3511cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
   3512				  size_t size, void **p, unsigned long caller)
   3513{
   3514	size_t i;
   3515
   3516	for (i = 0; i < size; i++)
   3517		p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
   3518}
   3519
   3520int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
   3521			  void **p)
   3522{
   3523	size_t i;
   3524	struct obj_cgroup *objcg = NULL;
   3525
   3526	s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
   3527	if (!s)
   3528		return 0;
   3529
   3530	cache_alloc_debugcheck_before(s, flags);
   3531
   3532	local_irq_disable();
   3533	for (i = 0; i < size; i++) {
   3534		void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
   3535
   3536		if (unlikely(!objp))
   3537			goto error;
   3538		p[i] = objp;
   3539	}
   3540	local_irq_enable();
   3541
   3542	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
   3543
   3544	/*
   3545	 * memcg and kmem_cache debug support and memory initialization.
   3546	 * Done outside of the IRQ disabled section.
   3547	 */
   3548	slab_post_alloc_hook(s, objcg, flags, size, p,
   3549				slab_want_init_on_alloc(flags, s));
   3550	/* FIXME: Trace call missing. Christoph would like a bulk variant */
   3551	return size;
   3552error:
   3553	local_irq_enable();
   3554	cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
   3555	slab_post_alloc_hook(s, objcg, flags, i, p, false);
   3556	__kmem_cache_free_bulk(s, i, p);
   3557	return 0;
   3558}
   3559EXPORT_SYMBOL(kmem_cache_alloc_bulk);
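/*
 * Illustrative sketch (not part of the original source; "example_cache" is
 * hypothetical): bulk allocation paired with kmem_cache_free_bulk().
 * kmem_cache_alloc_bulk() returns the number of objects stored in the array
 * (equal to the request) on success and 0 on failure, in which case nothing
 * has to be freed.
 */
static inline int slab_bulk_example(struct kmem_cache *example_cache)
{
	void *objs[16];

	if (!kmem_cache_alloc_bulk(example_cache, GFP_KERNEL,
				   ARRAY_SIZE(objs), objs))
		return -ENOMEM;

	/* ... use the objects ... */

	kmem_cache_free_bulk(example_cache, ARRAY_SIZE(objs), objs);
	return 0;
}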
   3560
   3561#ifdef CONFIG_TRACING
   3562void *
   3563kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
   3564{
   3565	void *ret;
   3566
   3567	ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
   3568
   3569	ret = kasan_kmalloc(cachep, ret, size, flags);
   3570	trace_kmalloc(_RET_IP_, ret,
   3571		      size, cachep->size, flags);
   3572	return ret;
   3573}
   3574EXPORT_SYMBOL(kmem_cache_alloc_trace);
   3575#endif
   3576
   3577#ifdef CONFIG_NUMA
   3578/**
   3579 * kmem_cache_alloc_node - Allocate an object on the specified node
   3580 * @cachep: The cache to allocate from.
   3581 * @flags: See kmalloc().
   3582 * @nodeid: node number of the target node.
   3583 *
    3584 * Identical to kmem_cache_alloc(), but it allocates the memory on the given
    3585 * node, which can improve performance for structures bound to a particular cpu.
    3586 *
    3587 * Falling back to another node is possible unless __GFP_THISNODE is set.
   3588 *
   3589 * Return: pointer to the new object or %NULL in case of error
   3590 */
   3591void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
   3592{
   3593	void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
   3594
   3595	trace_kmem_cache_alloc_node(_RET_IP_, ret,
   3596				    cachep->object_size, cachep->size,
   3597				    flags, nodeid);
   3598
   3599	return ret;
   3600}
   3601EXPORT_SYMBOL(kmem_cache_alloc_node);
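/*
 * Illustrative sketch (not part of the original source; the cache and the
 * destination array are hypothetical): allocating one object per online
 * node, as a NUMA-aware caller might. Without __GFP_THISNODE the allocation
 * may fall back to another node, as noted above.
 */
static inline int slab_alloc_node_example(struct kmem_cache *example_cache,
					  void **per_node, int nr)
{
	int nid;

	for_each_online_node(nid) {
		if (nid >= nr)
			break;
		per_node[nid] = kmem_cache_alloc_node(example_cache,
						      GFP_KERNEL, nid);
		if (!per_node[nid])
			return -ENOMEM;
	}
	return 0;
}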
   3602
   3603#ifdef CONFIG_TRACING
   3604void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
   3605				  gfp_t flags,
   3606				  int nodeid,
   3607				  size_t size)
   3608{
   3609	void *ret;
   3610
   3611	ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
   3612
   3613	ret = kasan_kmalloc(cachep, ret, size, flags);
   3614	trace_kmalloc_node(_RET_IP_, ret,
   3615			   size, cachep->size,
   3616			   flags, nodeid);
   3617	return ret;
   3618}
   3619EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
   3620#endif
   3621
   3622static __always_inline void *
   3623__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
   3624{
   3625	struct kmem_cache *cachep;
   3626	void *ret;
   3627
   3628	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
   3629		return NULL;
   3630	cachep = kmalloc_slab(size, flags);
   3631	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
   3632		return cachep;
   3633	ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
   3634	ret = kasan_kmalloc(cachep, ret, size, flags);
   3635
   3636	return ret;
   3637}
   3638
   3639void *__kmalloc_node(size_t size, gfp_t flags, int node)
   3640{
   3641	return __do_kmalloc_node(size, flags, node, _RET_IP_);
   3642}
   3643EXPORT_SYMBOL(__kmalloc_node);
   3644
   3645void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
   3646		int node, unsigned long caller)
   3647{
   3648	return __do_kmalloc_node(size, flags, node, caller);
   3649}
   3650EXPORT_SYMBOL(__kmalloc_node_track_caller);
   3651#endif /* CONFIG_NUMA */
   3652
   3653#ifdef CONFIG_PRINTK
   3654void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
   3655{
   3656	struct kmem_cache *cachep;
   3657	unsigned int objnr;
   3658	void *objp;
   3659
   3660	kpp->kp_ptr = object;
   3661	kpp->kp_slab = slab;
   3662	cachep = slab->slab_cache;
   3663	kpp->kp_slab_cache = cachep;
   3664	objp = object - obj_offset(cachep);
   3665	kpp->kp_data_offset = obj_offset(cachep);
   3666	slab = virt_to_slab(objp);
   3667	objnr = obj_to_index(cachep, slab, objp);
   3668	objp = index_to_obj(cachep, slab, objnr);
   3669	kpp->kp_objp = objp;
   3670	if (DEBUG && cachep->flags & SLAB_STORE_USER)
   3671		kpp->kp_ret = *dbg_userword(cachep, objp);
   3672}
   3673#endif
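/*
 * Illustrative note (hedged): __kmem_obj_info() above supplies the data that
 * kmem_dump_obj() and mem_dump_obj() print when debugging a suspect slab
 * pointer; with SLAB_STORE_USER set it also reports the allocation caller.
 */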
   3674
   3675/**
   3676 * __do_kmalloc - allocate memory
   3677 * @size: how many bytes of memory are required.
   3678 * @flags: the type of memory to allocate (see kmalloc).
    3679 * @caller: return address of the caller, used for debug tracking
   3680 *
   3681 * Return: pointer to the allocated memory or %NULL in case of error
   3682 */
   3683static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
   3684					  unsigned long caller)
   3685{
   3686	struct kmem_cache *cachep;
   3687	void *ret;
   3688
   3689	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
   3690		return NULL;
   3691	cachep = kmalloc_slab(size, flags);
   3692	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
   3693		return cachep;
   3694	ret = slab_alloc(cachep, NULL, flags, size, caller);
   3695
   3696	ret = kasan_kmalloc(cachep, ret, size, flags);
   3697	trace_kmalloc(caller, ret,
   3698		      size, cachep->size, flags);
   3699
   3700	return ret;
   3701}
   3702
   3703void *__kmalloc(size_t size, gfp_t flags)
   3704{
   3705	return __do_kmalloc(size, flags, _RET_IP_);
   3706}
   3707EXPORT_SYMBOL(__kmalloc);
   3708
   3709void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
   3710{
   3711	return __do_kmalloc(size, flags, caller);
   3712}
   3713EXPORT_SYMBOL(__kmalloc_track_caller);
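/*
 * Illustrative sketch (not part of the original source; "len" is a
 * hypothetical runtime length): kmalloc() in <linux/slab.h> hands
 * non-constant-size requests to __kmalloc() above. A zero size yields
 * ZERO_SIZE_PTR and failure yields NULL; both are safe to pass to kfree().
 */
static inline void *slab_kmalloc_example(size_t len)
{
	return kmalloc(len, GFP_KERNEL);
}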
   3714
   3715/**
   3716 * kmem_cache_free - Deallocate an object
   3717 * @cachep: The cache the allocation was from.
   3718 * @objp: The previously allocated object.
   3719 *
   3720 * Free an object which was previously allocated from this
   3721 * cache.
   3722 */
   3723void kmem_cache_free(struct kmem_cache *cachep, void *objp)
   3724{
   3725	unsigned long flags;
   3726	cachep = cache_from_obj(cachep, objp);
   3727	if (!cachep)
   3728		return;
   3729
   3730	trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
   3731	local_irq_save(flags);
   3732	debug_check_no_locks_freed(objp, cachep->object_size);
   3733	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
   3734		debug_check_no_obj_freed(objp, cachep->object_size);
   3735	__cache_free(cachep, objp, _RET_IP_);
   3736	local_irq_restore(flags);
   3737}
   3738EXPORT_SYMBOL(kmem_cache_free);
   3739
   3740void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
   3741{
   3742	struct kmem_cache *s;
   3743	size_t i;
   3744
   3745	local_irq_disable();
   3746	for (i = 0; i < size; i++) {
   3747		void *objp = p[i];
   3748
   3749		if (!orig_s) /* called via kfree_bulk */
   3750			s = virt_to_cache(objp);
   3751		else
   3752			s = cache_from_obj(orig_s, objp);
   3753		if (!s)
   3754			continue;
   3755
   3756		debug_check_no_locks_freed(objp, s->object_size);
   3757		if (!(s->flags & SLAB_DEBUG_OBJECTS))
   3758			debug_check_no_obj_freed(objp, s->object_size);
   3759
   3760		__cache_free(s, objp, _RET_IP_);
   3761	}
   3762	local_irq_enable();
   3763
   3764	/* FIXME: add tracing */
   3765}
   3766EXPORT_SYMBOL(kmem_cache_free_bulk);
   3767
   3768/**
   3769 * kfree - free previously allocated memory
   3770 * @objp: pointer returned by kmalloc.
   3771 *
   3772 * If @objp is NULL, no operation is performed.
   3773 *
   3774 * Don't free memory not originally allocated by kmalloc()
   3775 * or you will run into trouble.
   3776 */
   3777void kfree(const void *objp)
   3778{
   3779	struct kmem_cache *c;
   3780	unsigned long flags;
   3781
   3782	trace_kfree(_RET_IP_, objp);
   3783
   3784	if (unlikely(ZERO_OR_NULL_PTR(objp)))
   3785		return;
   3786	local_irq_save(flags);
   3787	kfree_debugcheck(objp);
   3788	c = virt_to_cache(objp);
   3789	if (!c) {
   3790		local_irq_restore(flags);
   3791		return;
   3792	}
   3793	debug_check_no_locks_freed(objp, c->object_size);
   3794
   3795	debug_check_no_obj_freed(objp, c->object_size);
   3796	__cache_free(c, (void *)objp, _RET_IP_);
   3797	local_irq_restore(flags);
   3798}
   3799EXPORT_SYMBOL(kfree);
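/*
 * Illustrative sketch (not part of the original source; the buffer names are
 * hypothetical): because kfree() ignores NULL (and ZERO_SIZE_PTR), error
 * paths may free unconditionally.
 */
static inline int slab_kfree_example(void)
{
	char *a = kmalloc(64, GFP_KERNEL);
	char *b = kmalloc(128, GFP_KERNEL);
	int ret = (a && b) ? 0 : -ENOMEM;

	/* Safe even if one (or both) of the allocations failed. */
	kfree(b);
	kfree(a);
	return ret;
}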
   3800
   3801/*
    3802 * This initializes kmem_cache_node, or resizes the per-node array caches, for all online nodes.
   3803 */
   3804static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
   3805{
   3806	int ret;
   3807	int node;
   3808	struct kmem_cache_node *n;
   3809
   3810	for_each_online_node(node) {
   3811		ret = setup_kmem_cache_node(cachep, node, gfp, true);
   3812		if (ret)
   3813			goto fail;
   3814
   3815	}
   3816
   3817	return 0;
   3818
   3819fail:
   3820	if (!cachep->list.next) {
   3821		/* Cache is not active yet. Roll back what we did */
   3822		node--;
   3823		while (node >= 0) {
   3824			n = get_node(cachep, node);
   3825			if (n) {
   3826				kfree(n->shared);
   3827				free_alien_cache(n->alien);
   3828				kfree(n);
   3829				cachep->node[node] = NULL;
   3830			}
   3831			node--;
   3832		}
   3833	}
   3834	return -ENOMEM;
   3835}
   3836
   3837/* Always called with the slab_mutex held */
   3838static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
   3839			    int batchcount, int shared, gfp_t gfp)
   3840{
   3841	struct array_cache __percpu *cpu_cache, *prev;
   3842	int cpu;
   3843
   3844	cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
   3845	if (!cpu_cache)
   3846		return -ENOMEM;
   3847
   3848	prev = cachep->cpu_cache;
   3849	cachep->cpu_cache = cpu_cache;
   3850	/*
   3851	 * Without a previous cpu_cache there's no need to synchronize remote
   3852	 * cpus, so skip the IPIs.
   3853	 */
   3854	if (prev)
   3855		kick_all_cpus_sync();
   3856
   3857	check_irq_on();
   3858	cachep->batchcount = batchcount;
   3859	cachep->limit = limit;
   3860	cachep->shared = shared;
   3861
   3862	if (!prev)
   3863		goto setup_node;
   3864
   3865	for_each_online_cpu(cpu) {
   3866		LIST_HEAD(list);
   3867		int node;
   3868		struct kmem_cache_node *n;
   3869		struct array_cache *ac = per_cpu_ptr(prev, cpu);
   3870
   3871		node = cpu_to_mem(cpu);
   3872		n = get_node(cachep, node);
   3873		spin_lock_irq(&n->list_lock);
   3874		free_block(cachep, ac->entry, ac->avail, node, &list);
   3875		spin_unlock_irq(&n->list_lock);
   3876		slabs_destroy(cachep, &list);
   3877	}
   3878	free_percpu(prev);
   3879
   3880setup_node:
   3881	return setup_kmem_cache_nodes(cachep, gfp);
   3882}
   3883
    3884/* Always called with the slab_mutex held */
   3885static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
   3886{
   3887	int err;
   3888	int limit = 0;
   3889	int shared = 0;
   3890	int batchcount = 0;
   3891
   3892	err = cache_random_seq_create(cachep, cachep->num, gfp);
   3893	if (err)
   3894		goto end;
   3895
   3896	/*
   3897	 * The head array serves three purposes:
   3898	 * - create a LIFO ordering, i.e. return objects that are cache-warm
   3899	 * - reduce the number of spinlock operations.
   3900	 * - reduce the number of linked list operations on the slab and
   3901	 *   bufctl chains: array operations are cheaper.
    3902	 * The numbers are guessed; we should auto-tune as described by
   3903	 * Bonwick.
   3904	 */
   3905	if (cachep->size > 131072)
   3906		limit = 1;
   3907	else if (cachep->size > PAGE_SIZE)
   3908		limit = 8;
   3909	else if (cachep->size > 1024)
   3910		limit = 24;
   3911	else if (cachep->size > 256)
   3912		limit = 54;
   3913	else
   3914		limit = 120;
   3915
   3916	/*
    3917	 * CPU bound tasks (e.g. network routing) can exhibit cpu-bound
    3918	 * allocation behaviour: most allocs happen on one cpu while most frees
    3919	 * happen on another. For these cases efficient object passing between
    3920	 * cpus is necessary. This is provided by a shared array, which
    3921	 * replaces Bonwick's magazine layer.
    3922	 * On a uniprocessor it is functionally equivalent (but less efficient)
    3923	 * to a larger limit, and is therefore disabled by default.
   3924	 */
   3925	shared = 0;
   3926	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
   3927		shared = 8;
   3928
   3929#if DEBUG
   3930	/*
    3931	 * With debugging enabled, a large batchcount leads to excessively long
    3932	 * periods with local interrupts disabled. Limit the batchcount.
   3933	 */
   3934	if (limit > 32)
   3935		limit = 32;
   3936#endif
   3937	batchcount = (limit + 1) / 2;
   3938	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
   3939end:
   3940	if (err)
   3941		pr_err("enable_cpucache failed for %s, error %d\n",
   3942		       cachep->name, -err);
   3943	return err;
   3944}
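/*
 * Worked example (illustrative, not from the original source): with 4 KiB
 * pages, !DEBUG and more than one possible cpu, a cache of 512-byte objects
 * gets limit = 54 (512 falls in the (256, 1024] bracket), shared = 8
 * (512 <= PAGE_SIZE) and batchcount = (54 + 1) / 2 = 27 from the heuristic
 * above.
 */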
   3945
   3946/*
    3947 * Drain an array if it contains any elements, taking the node lock only if
    3948 * necessary. Note that the node list_lock also protects the array_cache
   3949 * if drain_array() is used on the shared array.
   3950 */
   3951static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
   3952			 struct array_cache *ac, int node)
   3953{
   3954	LIST_HEAD(list);
   3955
   3956	/* ac from n->shared can be freed if we don't hold the slab_mutex. */
   3957	check_mutex_acquired();
   3958
   3959	if (!ac || !ac->avail)
   3960		return;
   3961
   3962	if (ac->touched) {
   3963		ac->touched = 0;
   3964		return;
   3965	}
   3966
   3967	spin_lock_irq(&n->list_lock);
   3968	drain_array_locked(cachep, ac, node, false, &list);
   3969	spin_unlock_irq(&n->list_lock);
   3970
   3971	slabs_destroy(cachep, &list);
   3972}
   3973
   3974/**
   3975 * cache_reap - Reclaim memory from caches.
   3976 * @w: work descriptor
   3977 *
   3978 * Called from workqueue/eventd every few seconds.
   3979 * Purpose:
   3980 * - clear the per-cpu caches for this CPU.
   3981 * - return freeable pages to the main free memory pool.
   3982 *
   3983 * If we cannot acquire the cache chain mutex then just give up - we'll try
   3984 * again on the next iteration.
   3985 */
   3986static void cache_reap(struct work_struct *w)
   3987{
   3988	struct kmem_cache *searchp;
   3989	struct kmem_cache_node *n;
   3990	int node = numa_mem_id();
   3991	struct delayed_work *work = to_delayed_work(w);
   3992
   3993	if (!mutex_trylock(&slab_mutex))
    3994		/* Give up. Set up the next iteration. */
   3995		goto out;
   3996
   3997	list_for_each_entry(searchp, &slab_caches, list) {
   3998		check_irq_on();
   3999
   4000		/*
   4001		 * We only take the node lock if absolutely necessary and we
   4002		 * have established with reasonable certainty that
   4003		 * we can do some work if the lock was obtained.
   4004		 */
   4005		n = get_node(searchp, node);
   4006
   4007		reap_alien(searchp, n);
   4008
   4009		drain_array(searchp, n, cpu_cache_get(searchp), node);
   4010
   4011		/*
   4012		 * These are racy checks but it does not matter
   4013		 * if we skip one check or scan twice.
   4014		 */
   4015		if (time_after(n->next_reap, jiffies))
   4016			goto next;
   4017
   4018		n->next_reap = jiffies + REAPTIMEOUT_NODE;
   4019
   4020		drain_array(searchp, n, n->shared, node);
   4021
   4022		if (n->free_touched)
   4023			n->free_touched = 0;
   4024		else {
   4025			int freed;
   4026
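			/*
			 * Free at most ~1/5 of this node's free_limit worth
			 * of slabs (rounded up) in one reap pass.
			 */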
   4027			freed = drain_freelist(searchp, n, (n->free_limit +
   4028				5 * searchp->num - 1) / (5 * searchp->num));
   4029			STATS_ADD_REAPED(searchp, freed);
   4030		}
   4031next:
   4032		cond_resched();
   4033	}
   4034	check_irq_on();
   4035	mutex_unlock(&slab_mutex);
   4036	next_reap_node();
   4037out:
   4038	/* Set up the next iteration */
   4039	schedule_delayed_work_on(smp_processor_id(), work,
   4040				round_jiffies_relative(REAPTIMEOUT_AC));
   4041}
   4042
   4043void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
   4044{
   4045	unsigned long active_objs, num_objs, active_slabs;
   4046	unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
   4047	unsigned long free_slabs = 0;
   4048	int node;
   4049	struct kmem_cache_node *n;
   4050
   4051	for_each_kmem_cache_node(cachep, node, n) {
   4052		check_irq_on();
   4053		spin_lock_irq(&n->list_lock);
   4054
   4055		total_slabs += n->total_slabs;
   4056		free_slabs += n->free_slabs;
   4057		free_objs += n->free_objects;
   4058
   4059		if (n->shared)
   4060			shared_avail += n->shared->avail;
   4061
   4062		spin_unlock_irq(&n->list_lock);
   4063	}
   4064	num_objs = total_slabs * cachep->num;
   4065	active_slabs = total_slabs - free_slabs;
   4066	active_objs = num_objs - free_objs;
   4067
   4068	sinfo->active_objs = active_objs;
   4069	sinfo->num_objs = num_objs;
   4070	sinfo->active_slabs = active_slabs;
   4071	sinfo->num_slabs = total_slabs;
   4072	sinfo->shared_avail = shared_avail;
   4073	sinfo->limit = cachep->limit;
   4074	sinfo->batchcount = cachep->batchcount;
   4075	sinfo->shared = cachep->shared;
   4076	sinfo->objects_per_slab = cachep->num;
   4077	sinfo->cache_order = cachep->gfporder;
   4078}
   4079
   4080void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
   4081{
   4082#if STATS
   4083	{			/* node stats */
   4084		unsigned long high = cachep->high_mark;
   4085		unsigned long allocs = cachep->num_allocations;
   4086		unsigned long grown = cachep->grown;
   4087		unsigned long reaped = cachep->reaped;
   4088		unsigned long errors = cachep->errors;
   4089		unsigned long max_freeable = cachep->max_freeable;
   4090		unsigned long node_allocs = cachep->node_allocs;
   4091		unsigned long node_frees = cachep->node_frees;
   4092		unsigned long overflows = cachep->node_overflow;
   4093
   4094		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
   4095			   allocs, high, grown,
   4096			   reaped, errors, max_freeable, node_allocs,
   4097			   node_frees, overflows);
   4098	}
   4099	/* cpu stats */
   4100	{
   4101		unsigned long allochit = atomic_read(&cachep->allochit);
   4102		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
   4103		unsigned long freehit = atomic_read(&cachep->freehit);
   4104		unsigned long freemiss = atomic_read(&cachep->freemiss);
   4105
   4106		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
   4107			   allochit, allocmiss, freehit, freemiss);
   4108	}
   4109#endif
   4110}
   4111
   4112#define MAX_SLABINFO_WRITE 128
   4113/**
   4114 * slabinfo_write - Tuning for the slab allocator
   4115 * @file: unused
   4116 * @buffer: user buffer
   4117 * @count: data length
   4118 * @ppos: unused
   4119 *
   4120 * Return: %0 on success, negative error code otherwise.
   4121 */
   4122ssize_t slabinfo_write(struct file *file, const char __user *buffer,
   4123		       size_t count, loff_t *ppos)
   4124{
   4125	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
   4126	int limit, batchcount, shared, res;
   4127	struct kmem_cache *cachep;
   4128
   4129	if (count > MAX_SLABINFO_WRITE)
   4130		return -EINVAL;
   4131	if (copy_from_user(&kbuf, buffer, count))
   4132		return -EFAULT;
   4133	kbuf[MAX_SLABINFO_WRITE] = '\0';
   4134
   4135	tmp = strchr(kbuf, ' ');
   4136	if (!tmp)
   4137		return -EINVAL;
   4138	*tmp = '\0';
   4139	tmp++;
   4140	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
   4141		return -EINVAL;
   4142
   4143	/* Find the cache in the chain of caches. */
   4144	mutex_lock(&slab_mutex);
   4145	res = -EINVAL;
   4146	list_for_each_entry(cachep, &slab_caches, list) {
   4147		if (!strcmp(cachep->name, kbuf)) {
   4148			if (limit < 1 || batchcount < 1 ||
   4149					batchcount > limit || shared < 0) {
   4150				res = 0;
   4151			} else {
   4152				res = do_tune_cpucache(cachep, limit,
   4153						       batchcount, shared,
   4154						       GFP_KERNEL);
   4155			}
   4156			break;
   4157		}
   4158	}
   4159	mutex_unlock(&slab_mutex);
   4160	if (res >= 0)
   4161		res = count;
   4162	return res;
   4163}
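/*
 * Illustrative note: the expected input is "<name> <limit> <batchcount>
 * <shared>", e.g. writing "dentry 120 60 8" to /proc/slabinfo retunes the
 * dentry cache. Out-of-range values are silently ignored (the write still
 * returns @count); an unknown cache name returns -EINVAL.
 */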
   4164
   4165#ifdef CONFIG_HARDENED_USERCOPY
   4166/*
   4167 * Rejects incorrectly sized objects and objects that are to be copied
   4168 * to/from userspace but do not fall entirely within the containing slab
   4169 * cache's usercopy region.
   4170 *
    4171 * Returns if the check passes; otherwise the error is reported via
    4172 * usercopy_abort(), which does not return.
   4173 */
   4174void __check_heap_object(const void *ptr, unsigned long n,
   4175			 const struct slab *slab, bool to_user)
   4176{
   4177	struct kmem_cache *cachep;
   4178	unsigned int objnr;
   4179	unsigned long offset;
   4180
   4181	ptr = kasan_reset_tag(ptr);
   4182
   4183	/* Find and validate object. */
   4184	cachep = slab->slab_cache;
   4185	objnr = obj_to_index(cachep, slab, (void *)ptr);
   4186	BUG_ON(objnr >= cachep->num);
   4187
   4188	/* Find offset within object. */
   4189	if (is_kfence_address(ptr))
   4190		offset = ptr - kfence_object_start(ptr);
   4191	else
   4192		offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep);
   4193
   4194	/* Allow address range falling entirely within usercopy region. */
   4195	if (offset >= cachep->useroffset &&
   4196	    offset - cachep->useroffset <= cachep->usersize &&
   4197	    n <= cachep->useroffset - offset + cachep->usersize)
   4198		return;
   4199
   4200	usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
   4201}
   4202#endif /* CONFIG_HARDENED_USERCOPY */
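/*
 * Illustrative sketch (not part of the original source; the struct and cache
 * name are hypothetical): a cache whose usercopy window covers only one
 * member, so __check_heap_object() rejects copy_to_user()/copy_from_user()
 * spans that stray outside it.
 */
struct usercopy_example {
	spinlock_t lock;		/* never exposed to userspace */
	char payload[64];		/* the only user-copyable region */
};

static inline struct kmem_cache *usercopy_example_cache_create(void)
{
	return kmem_cache_create_usercopy("usercopy_example",
			sizeof(struct usercopy_example), 0, SLAB_HWCACHE_ALIGN,
			offsetof(struct usercopy_example, payload),
			sizeof_field(struct usercopy_example, payload), NULL);
}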
   4203
   4204/**
   4205 * __ksize -- Uninstrumented ksize.
   4206 * @objp: pointer to the object
   4207 *
   4208 * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
   4209 * safety checks as ksize() with KASAN instrumentation enabled.
   4210 *
   4211 * Return: size of the actual memory used by @objp in bytes
   4212 */
   4213size_t __ksize(const void *objp)
   4214{
   4215	struct kmem_cache *c;
   4216	size_t size;
   4217
   4218	BUG_ON(!objp);
   4219	if (unlikely(objp == ZERO_SIZE_PTR))
   4220		return 0;
   4221
   4222	c = virt_to_cache(objp);
   4223	size = c ? c->object_size : 0;
   4224
   4225	return size;
   4226}
   4227EXPORT_SYMBOL(__ksize);
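/*
 * Illustrative sketch (not part of the original source): ksize(), the
 * instrumented wrapper around __ksize(), reports the usable size of the
 * backing object, which may exceed the requested size; e.g. a 30-byte
 * kmalloc() is served from the 32-byte kmalloc cache and reports 32.
 */
static inline size_t slab_ksize_example(void)
{
	char *p = kmalloc(30, GFP_KERNEL);
	size_t usable;

	if (!p)
		return 0;
	usable = ksize(p);	/* 32 with the default kmalloc size classes */
	kfree(p);
	return usable;
}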