cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

memcontrol.c (201008B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/* memcontrol.c - Memory Controller
      3 *
      4 * Copyright IBM Corporation, 2007
      5 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
      6 *
      7 * Copyright 2007 OpenVZ SWsoft Inc
      8 * Author: Pavel Emelianov <xemul@openvz.org>
      9 *
     10 * Memory thresholds
     11 * Copyright (C) 2009 Nokia Corporation
     12 * Author: Kirill A. Shutemov
     13 *
     14 * Kernel Memory Controller
     15 * Copyright (C) 2012 Parallels Inc. and Google Inc.
     16 * Authors: Glauber Costa and Suleiman Souhlal
     17 *
     18 * Native page reclaim
     19 * Charge lifetime sanitation
     20 * Lockless page tracking & accounting
     21 * Unified hierarchy configuration model
     22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
     23 *
     24 * Per memcg lru locking
     25 * Copyright (C) 2020 Alibaba, Inc, Alex Shi
     26 */
     27
     28#include <linux/page_counter.h>
     29#include <linux/memcontrol.h>
     30#include <linux/cgroup.h>
     31#include <linux/pagewalk.h>
     32#include <linux/sched/mm.h>
     33#include <linux/shmem_fs.h>
     34#include <linux/hugetlb.h>
     35#include <linux/pagemap.h>
     36#include <linux/vm_event_item.h>
     37#include <linux/smp.h>
     38#include <linux/page-flags.h>
     39#include <linux/backing-dev.h>
     40#include <linux/bit_spinlock.h>
     41#include <linux/rcupdate.h>
     42#include <linux/limits.h>
     43#include <linux/export.h>
     44#include <linux/mutex.h>
     45#include <linux/rbtree.h>
     46#include <linux/slab.h>
     47#include <linux/swap.h>
     48#include <linux/swapops.h>
     49#include <linux/spinlock.h>
     50#include <linux/eventfd.h>
     51#include <linux/poll.h>
     52#include <linux/sort.h>
     53#include <linux/fs.h>
     54#include <linux/seq_file.h>
     55#include <linux/vmpressure.h>
     56#include <linux/memremap.h>
     57#include <linux/mm_inline.h>
     58#include <linux/swap_cgroup.h>
     59#include <linux/cpu.h>
     60#include <linux/oom.h>
     61#include <linux/lockdep.h>
     62#include <linux/file.h>
     63#include <linux/resume_user_mode.h>
     64#include <linux/psi.h>
     65#include <linux/seq_buf.h>
     66#include "internal.h"
     67#include <net/sock.h>
     68#include <net/ip.h>
     69#include "slab.h"
     70#include "swap.h"
     71
     72#include <linux/uaccess.h>
     73
     74#include <trace/events/vmscan.h>
     75
     76struct cgroup_subsys memory_cgrp_subsys __read_mostly;
     77EXPORT_SYMBOL(memory_cgrp_subsys);
     78
     79struct mem_cgroup *root_mem_cgroup __read_mostly;
     80
     81/* Active memory cgroup to use from an interrupt context */
     82DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
     83EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
     84
     85/* Socket memory accounting disabled? */
     86static bool cgroup_memory_nosocket __ro_after_init;
     87
     88/* Kernel memory accounting disabled? */
     89static bool cgroup_memory_nokmem __ro_after_init;
     90
     91/* Whether the swap controller is active */
     92#ifdef CONFIG_MEMCG_SWAP
     93static bool cgroup_memory_noswap __ro_after_init;
     94#else
     95#define cgroup_memory_noswap		1
     96#endif
     97
     98#ifdef CONFIG_CGROUP_WRITEBACK
     99static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
    100#endif
    101
    102/* Whether legacy memory+swap accounting is active */
    103static bool do_memsw_account(void)
    104{
    105	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
    106}
    107
    108#define THRESHOLDS_EVENTS_TARGET 128
    109#define SOFTLIMIT_EVENTS_TARGET 1024
    110
    111/*
    112 * Cgroups above their limits are maintained in a RB-Tree, independent of
    113 * their hierarchy representation
    114 */
    115
    116struct mem_cgroup_tree_per_node {
    117	struct rb_root rb_root;
    118	struct rb_node *rb_rightmost;
    119	spinlock_t lock;
    120};
    121
    122struct mem_cgroup_tree {
    123	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
    124};
    125
    126static struct mem_cgroup_tree soft_limit_tree __read_mostly;
    127
    128/* for OOM */
    129struct mem_cgroup_eventfd_list {
    130	struct list_head list;
    131	struct eventfd_ctx *eventfd;
    132};
    133
    134/*
     135 * cgroup_event represents events which userspace wants to receive.
    136 */
    137struct mem_cgroup_event {
    138	/*
    139	 * memcg which the event belongs to.
    140	 */
    141	struct mem_cgroup *memcg;
    142	/*
    143	 * eventfd to signal userspace about the event.
    144	 */
    145	struct eventfd_ctx *eventfd;
    146	/*
     147	 * Each of these is stored in a list by the cgroup.
    148	 */
    149	struct list_head list;
    150	/*
     151	 * register_event() callback will be used to add a new userspace
     152	 * waiter for changes related to this event.  Use eventfd_signal()
    153	 * on eventfd to send notification to userspace.
    154	 */
    155	int (*register_event)(struct mem_cgroup *memcg,
    156			      struct eventfd_ctx *eventfd, const char *args);
    157	/*
     158	 * unregister_event() callback will be called when userspace closes
     159	 * the eventfd or when the cgroup is removed.  This callback must be
     160	 * set if you want to provide notification functionality.
    161	 */
    162	void (*unregister_event)(struct mem_cgroup *memcg,
    163				 struct eventfd_ctx *eventfd);
    164	/*
    165	 * All fields below needed to unregister event when
    166	 * userspace closes eventfd.
    167	 */
    168	poll_table pt;
    169	wait_queue_head_t *wqh;
    170	wait_queue_entry_t wait;
    171	struct work_struct remove;
    172};
    173
    174static void mem_cgroup_threshold(struct mem_cgroup *memcg);
    175static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
    176
     177/* Stuff for moving charges at task migration. */
    178/*
    179 * Types of charges to be moved.
    180 */
    181#define MOVE_ANON	0x1U
    182#define MOVE_FILE	0x2U
    183#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
    184
    185/* "mc" and its members are protected by cgroup_mutex */
    186static struct move_charge_struct {
    187	spinlock_t	  lock; /* for from, to */
    188	struct mm_struct  *mm;
    189	struct mem_cgroup *from;
    190	struct mem_cgroup *to;
    191	unsigned long flags;
    192	unsigned long precharge;
    193	unsigned long moved_charge;
    194	unsigned long moved_swap;
    195	struct task_struct *moving_task;	/* a task moving charges */
    196	wait_queue_head_t waitq;		/* a waitq for other context */
    197} mc = {
    198	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
    199	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
    200};
    201
    202/*
    203 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
    204 * limit reclaim to prevent infinite loops, if they ever occur.
    205 */
    206#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
    207#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
    208
    209/* for encoding cft->private value on file */
    210enum res_type {
    211	_MEM,
    212	_MEMSWAP,
    213	_KMEM,
    214	_TCP,
    215};
    216
    217#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
    218#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
    219#define MEMFILE_ATTR(val)	((val) & 0xffff)
    220
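/*
 * Illustrative sketch (not compiled): how the MEMFILE_* helpers above
 * round-trip a (type, attr) pair.  The attr value 3 is an arbitrary
 * example, not a real cft->private constant.
 */
#if 0
static void memfile_encoding_example(void)
{
	int priv = MEMFILE_PRIVATE(_MEMSWAP, 3);	/* (_MEMSWAP << 16) | 3 */

	WARN_ON(MEMFILE_TYPE(priv) != _MEMSWAP);	/* high 16 bits */
	WARN_ON(MEMFILE_ATTR(priv) != 3);		/* low 16 bits */
}
#endif
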
    221/*
    222 * Iteration constructs for visiting all cgroups (under a tree).  If
    223 * loops are exited prematurely (break), mem_cgroup_iter_break() must
    224 * be used for reference counting.
    225 */
    226#define for_each_mem_cgroup_tree(iter, root)		\
    227	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
    228	     iter != NULL;				\
    229	     iter = mem_cgroup_iter(root, iter, NULL))
    230
    231#define for_each_mem_cgroup(iter)			\
    232	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
    233	     iter != NULL;				\
    234	     iter = mem_cgroup_iter(NULL, iter, NULL))
    235
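/*
 * Illustrative sketch (not compiled): exiting a tree walk early.  As the
 * comment above notes, a premature break must be paired with
 * mem_cgroup_iter_break() so the iterator's css reference is dropped.
 */
#if 0
static bool any_descendant_under_oom(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (iter->under_oom) {
			/* drop the reference the iterator still holds */
			mem_cgroup_iter_break(root, iter);
			return true;
		}
	}
	return false;
}
#endif
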
    236static inline bool task_is_dying(void)
    237{
    238	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
    239		(current->flags & PF_EXITING);
    240}
    241
    242/* Some nice accessors for the vmpressure. */
    243struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
    244{
    245	if (!memcg)
    246		memcg = root_mem_cgroup;
    247	return &memcg->vmpressure;
    248}
    249
    250struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
    251{
    252	return container_of(vmpr, struct mem_cgroup, vmpressure);
    253}
    254
    255#ifdef CONFIG_MEMCG_KMEM
    256static DEFINE_SPINLOCK(objcg_lock);
    257
    258bool mem_cgroup_kmem_disabled(void)
    259{
    260	return cgroup_memory_nokmem;
    261}
    262
    263static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
    264				      unsigned int nr_pages);
    265
    266static void obj_cgroup_release(struct percpu_ref *ref)
    267{
    268	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
    269	unsigned int nr_bytes;
    270	unsigned int nr_pages;
    271	unsigned long flags;
    272
    273	/*
    274	 * At this point all allocated objects are freed, and
    275	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
    276	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
    277	 *
    278	 * The following sequence can lead to it:
    279	 * 1) CPU0: objcg == stock->cached_objcg
    280	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
    281	 *          PAGE_SIZE bytes are charged
    282	 * 3) CPU1: a process from another memcg is allocating something,
    283	 *          the stock if flushed,
    284	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
    285	 * 5) CPU0: we do release this object,
    286	 *          92 bytes are added to stock->nr_bytes
    287	 * 6) CPU0: stock is flushed,
    288	 *          92 bytes are added to objcg->nr_charged_bytes
    289	 *
    290	 * In the result, nr_charged_bytes == PAGE_SIZE.
    291	 * This page will be uncharged in obj_cgroup_release().
    292	 */
    293	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
    294	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
    295	nr_pages = nr_bytes >> PAGE_SHIFT;
    296
    297	if (nr_pages)
    298		obj_cgroup_uncharge_pages(objcg, nr_pages);
    299
    300	spin_lock_irqsave(&objcg_lock, flags);
    301	list_del(&objcg->list);
    302	spin_unlock_irqrestore(&objcg_lock, flags);
    303
    304	percpu_ref_exit(ref);
    305	kfree_rcu(objcg, rcu);
    306}
    307
    308static struct obj_cgroup *obj_cgroup_alloc(void)
    309{
    310	struct obj_cgroup *objcg;
    311	int ret;
    312
    313	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
    314	if (!objcg)
    315		return NULL;
    316
    317	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
    318			      GFP_KERNEL);
    319	if (ret) {
    320		kfree(objcg);
    321		return NULL;
    322	}
    323	INIT_LIST_HEAD(&objcg->list);
    324	return objcg;
    325}
    326
    327static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
    328				  struct mem_cgroup *parent)
    329{
    330	struct obj_cgroup *objcg, *iter;
    331
    332	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
    333
    334	spin_lock_irq(&objcg_lock);
    335
    336	/* 1) Ready to reparent active objcg. */
    337	list_add(&objcg->list, &memcg->objcg_list);
    338	/* 2) Reparent active objcg and already reparented objcgs to parent. */
    339	list_for_each_entry(iter, &memcg->objcg_list, list)
    340		WRITE_ONCE(iter->memcg, parent);
    341	/* 3) Move already reparented objcgs to the parent's list */
    342	list_splice(&memcg->objcg_list, &parent->objcg_list);
    343
    344	spin_unlock_irq(&objcg_lock);
    345
    346	percpu_ref_kill(&objcg->refcnt);
    347}
    348
    349/*
    350 * A lot of the calls to the cache allocation functions are expected to be
    351 * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
     352 * conditional on this static branch, we'll have to allow modules that do
     353 * kmem_cache_alloc and the like to see this symbol as well.
    354 */
    355DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
    356EXPORT_SYMBOL(memcg_kmem_enabled_key);
    357#endif
    358
    359/**
    360 * mem_cgroup_css_from_page - css of the memcg associated with a page
    361 * @page: page of interest
    362 *
    363 * If memcg is bound to the default hierarchy, css of the memcg associated
    364 * with @page is returned.  The returned css remains associated with @page
    365 * until it is released.
    366 *
    367 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
    368 * is returned.
    369 */
    370struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
    371{
    372	struct mem_cgroup *memcg;
    373
    374	memcg = page_memcg(page);
    375
    376	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
    377		memcg = root_mem_cgroup;
    378
    379	return &memcg->css;
    380}
    381
    382/**
    383 * page_cgroup_ino - return inode number of the memcg a page is charged to
    384 * @page: the page
    385 *
    386 * Look up the closest online ancestor of the memory cgroup @page is charged to
    387 * and return its inode number or 0 if @page is not charged to any cgroup. It
    388 * is safe to call this function without holding a reference to @page.
    389 *
    390 * Note, this function is inherently racy, because there is nothing to prevent
    391 * the cgroup inode from getting torn down and potentially reallocated a moment
    392 * after page_cgroup_ino() returns, so it only should be used by callers that
    393 * do not care (such as procfs interfaces).
    394 */
    395ino_t page_cgroup_ino(struct page *page)
    396{
    397	struct mem_cgroup *memcg;
    398	unsigned long ino = 0;
    399
    400	rcu_read_lock();
    401	memcg = page_memcg_check(page);
    402
    403	while (memcg && !(memcg->css.flags & CSS_ONLINE))
    404		memcg = parent_mem_cgroup(memcg);
    405	if (memcg)
    406		ino = cgroup_ino(memcg->css.cgroup);
    407	rcu_read_unlock();
    408	return ino;
    409}
    410
    411static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
    412					 struct mem_cgroup_tree_per_node *mctz,
    413					 unsigned long new_usage_in_excess)
    414{
    415	struct rb_node **p = &mctz->rb_root.rb_node;
    416	struct rb_node *parent = NULL;
    417	struct mem_cgroup_per_node *mz_node;
    418	bool rightmost = true;
    419
    420	if (mz->on_tree)
    421		return;
    422
    423	mz->usage_in_excess = new_usage_in_excess;
    424	if (!mz->usage_in_excess)
    425		return;
    426	while (*p) {
    427		parent = *p;
    428		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
    429					tree_node);
    430		if (mz->usage_in_excess < mz_node->usage_in_excess) {
    431			p = &(*p)->rb_left;
    432			rightmost = false;
    433		} else {
    434			p = &(*p)->rb_right;
    435		}
    436	}
    437
    438	if (rightmost)
    439		mctz->rb_rightmost = &mz->tree_node;
    440
    441	rb_link_node(&mz->tree_node, parent, p);
    442	rb_insert_color(&mz->tree_node, &mctz->rb_root);
    443	mz->on_tree = true;
    444}
    445
    446static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
    447					 struct mem_cgroup_tree_per_node *mctz)
    448{
    449	if (!mz->on_tree)
    450		return;
    451
    452	if (&mz->tree_node == mctz->rb_rightmost)
    453		mctz->rb_rightmost = rb_prev(&mz->tree_node);
    454
    455	rb_erase(&mz->tree_node, &mctz->rb_root);
    456	mz->on_tree = false;
    457}
    458
    459static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
    460				       struct mem_cgroup_tree_per_node *mctz)
    461{
    462	unsigned long flags;
    463
    464	spin_lock_irqsave(&mctz->lock, flags);
    465	__mem_cgroup_remove_exceeded(mz, mctz);
    466	spin_unlock_irqrestore(&mctz->lock, flags);
    467}
    468
    469static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
    470{
    471	unsigned long nr_pages = page_counter_read(&memcg->memory);
    472	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
    473	unsigned long excess = 0;
    474
    475	if (nr_pages > soft_limit)
    476		excess = nr_pages - soft_limit;
    477
    478	return excess;
    479}
    480
    481static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
    482{
    483	unsigned long excess;
    484	struct mem_cgroup_per_node *mz;
    485	struct mem_cgroup_tree_per_node *mctz;
    486
    487	mctz = soft_limit_tree.rb_tree_per_node[nid];
    488	if (!mctz)
    489		return;
    490	/*
     491	 * Necessary to update all ancestors when hierarchy is used,
     492	 * because their event counters are not touched.
    493	 */
    494	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
    495		mz = memcg->nodeinfo[nid];
    496		excess = soft_limit_excess(memcg);
    497		/*
    498		 * We have to update the tree if mz is on RB-tree or
    499		 * mem is over its softlimit.
    500		 */
    501		if (excess || mz->on_tree) {
    502			unsigned long flags;
    503
    504			spin_lock_irqsave(&mctz->lock, flags);
    505			/* if on-tree, remove it */
    506			if (mz->on_tree)
    507				__mem_cgroup_remove_exceeded(mz, mctz);
    508			/*
    509			 * Insert again. mz->usage_in_excess will be updated.
    510			 * If excess is 0, no tree ops.
    511			 */
    512			__mem_cgroup_insert_exceeded(mz, mctz, excess);
    513			spin_unlock_irqrestore(&mctz->lock, flags);
    514		}
    515	}
    516}
    517
    518static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
    519{
    520	struct mem_cgroup_tree_per_node *mctz;
    521	struct mem_cgroup_per_node *mz;
    522	int nid;
    523
    524	for_each_node(nid) {
    525		mz = memcg->nodeinfo[nid];
    526		mctz = soft_limit_tree.rb_tree_per_node[nid];
    527		if (mctz)
    528			mem_cgroup_remove_exceeded(mz, mctz);
    529	}
    530}
    531
    532static struct mem_cgroup_per_node *
    533__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
    534{
    535	struct mem_cgroup_per_node *mz;
    536
    537retry:
    538	mz = NULL;
    539	if (!mctz->rb_rightmost)
    540		goto done;		/* Nothing to reclaim from */
    541
    542	mz = rb_entry(mctz->rb_rightmost,
    543		      struct mem_cgroup_per_node, tree_node);
    544	/*
     545	 * Remove the node now, but someone else can add it back;
     546	 * we will add it back at the end of reclaim to its correct
    547	 * position in the tree.
    548	 */
    549	__mem_cgroup_remove_exceeded(mz, mctz);
    550	if (!soft_limit_excess(mz->memcg) ||
    551	    !css_tryget(&mz->memcg->css))
    552		goto retry;
    553done:
    554	return mz;
    555}
    556
    557static struct mem_cgroup_per_node *
    558mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
    559{
    560	struct mem_cgroup_per_node *mz;
    561
    562	spin_lock_irq(&mctz->lock);
    563	mz = __mem_cgroup_largest_soft_limit_node(mctz);
    564	spin_unlock_irq(&mctz->lock);
    565	return mz;
    566}
    567
    568/*
    569 * memcg and lruvec stats flushing
    570 *
    571 * Many codepaths leading to stats update or read are performance sensitive and
    572 * adding stats flushing in such codepaths is not desirable. So, to optimize the
    573 * flushing the kernel does:
    574 *
     575 * 1) Periodically and asynchronously flush the stats every 2 seconds so that
     576 *    the rstat update tree does not grow unbounded.
     577 *
     578 * 2) Flush the stats synchronously on the reader side only when there are more
     579 *    than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This lets the stats be
     580 *    out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) events, but only
     581 *    for 2 seconds due to (1).
    582 */
    583static void flush_memcg_stats_dwork(struct work_struct *w);
    584static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
    585static DEFINE_SPINLOCK(stats_flush_lock);
    586static DEFINE_PER_CPU(unsigned int, stats_updates);
    587static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
    588static u64 flush_next_time;
    589
    590#define FLUSH_TIME (2UL*HZ)
    591
    592/*
     593 * Accessors to ensure that preemption is disabled on PREEMPT_RT, because an
     594 * acquired spinlock_t lock does not imply it there. These functions are never
     595 * used in hardirq context on PREEMPT_RT, and therefore disabling preemption
    596 * is sufficient.
    597 */
    598static void memcg_stats_lock(void)
    599{
    600#ifdef CONFIG_PREEMPT_RT
    601      preempt_disable();
    602#else
    603      VM_BUG_ON(!irqs_disabled());
    604#endif
    605}
    606
    607static void __memcg_stats_lock(void)
    608{
    609#ifdef CONFIG_PREEMPT_RT
    610      preempt_disable();
    611#endif
    612}
    613
    614static void memcg_stats_unlock(void)
    615{
    616#ifdef CONFIG_PREEMPT_RT
    617      preempt_enable();
    618#endif
    619}
    620
    621static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
    622{
    623	unsigned int x;
    624
    625	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
    626
    627	x = __this_cpu_add_return(stats_updates, abs(val));
    628	if (x > MEMCG_CHARGE_BATCH) {
    629		atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
    630		__this_cpu_write(stats_updates, 0);
    631	}
    632}
    633
    634static void __mem_cgroup_flush_stats(void)
    635{
    636	unsigned long flag;
    637
    638	if (!spin_trylock_irqsave(&stats_flush_lock, flag))
    639		return;
    640
    641	flush_next_time = jiffies_64 + 2*FLUSH_TIME;
    642	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
    643	atomic_set(&stats_flush_threshold, 0);
    644	spin_unlock_irqrestore(&stats_flush_lock, flag);
    645}
    646
    647void mem_cgroup_flush_stats(void)
    648{
    649	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
    650		__mem_cgroup_flush_stats();
    651}
    652
    653void mem_cgroup_flush_stats_delayed(void)
    654{
    655	if (time_after64(jiffies_64, flush_next_time))
    656		mem_cgroup_flush_stats();
    657}
    658
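/*
 * Illustrative sketch (not compiled): the typical reader-side pairing of
 * the flush helpers above with a stats read, in the same style as
 * memory_stat_format() further down in this file.
 */
#if 0
static unsigned long sample_anon_pages(struct mem_cgroup *memcg)
{
	/* Fold any pending per-CPU updates into memcg->vmstats first. */
	mem_cgroup_flush_stats();
	return memcg_page_state(memcg, NR_ANON_MAPPED);
}
#endif
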
    659static void flush_memcg_stats_dwork(struct work_struct *w)
    660{
    661	__mem_cgroup_flush_stats();
    662	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
    663}
    664
    665/**
    666 * __mod_memcg_state - update cgroup memory statistics
    667 * @memcg: the memory cgroup
    668 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
    669 * @val: delta to add to the counter, can be negative
    670 */
    671void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
    672{
    673	if (mem_cgroup_disabled())
    674		return;
    675
    676	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
    677	memcg_rstat_updated(memcg, val);
    678}
    679
    680/* idx can be of type enum memcg_stat_item or node_stat_item. */
    681static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
    682{
    683	long x = 0;
    684	int cpu;
    685
    686	for_each_possible_cpu(cpu)
    687		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
    688#ifdef CONFIG_SMP
    689	if (x < 0)
    690		x = 0;
    691#endif
    692	return x;
    693}
    694
    695void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
    696			      int val)
    697{
    698	struct mem_cgroup_per_node *pn;
    699	struct mem_cgroup *memcg;
    700
    701	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
    702	memcg = pn->memcg;
    703
    704	/*
     705	 * Callers from rmap rely on disabled preemption because they never
     706	 * update their counters from in-interrupt context. For those counters
     707	 * we check that the update is never performed from an interrupt
     708	 * context, while the other callers need to have interrupts disabled.
    709	 */
    710	__memcg_stats_lock();
    711	if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
    712		switch (idx) {
    713		case NR_ANON_MAPPED:
    714		case NR_FILE_MAPPED:
    715		case NR_ANON_THPS:
    716		case NR_SHMEM_PMDMAPPED:
    717		case NR_FILE_PMDMAPPED:
    718			WARN_ON_ONCE(!in_task());
    719			break;
    720		default:
    721			WARN_ON_ONCE(!irqs_disabled());
    722		}
    723	}
    724
    725	/* Update memcg */
    726	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
    727
    728	/* Update lruvec */
    729	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
    730
    731	memcg_rstat_updated(memcg, val);
    732	memcg_stats_unlock();
    733}
    734
    735/**
    736 * __mod_lruvec_state - update lruvec memory statistics
    737 * @lruvec: the lruvec
    738 * @idx: the stat item
    739 * @val: delta to add to the counter, can be negative
    740 *
    741 * The lruvec is the intersection of the NUMA node and a cgroup. This
     742 * function updates all three counters that are affected by a
    743 * change of state at this level: per-node, per-cgroup, per-lruvec.
    744 */
    745void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
    746			int val)
    747{
    748	/* Update node */
    749	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
    750
    751	/* Update memcg and lruvec */
    752	if (!mem_cgroup_disabled())
    753		__mod_memcg_lruvec_state(lruvec, idx, val);
    754}
    755
    756void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
    757			     int val)
    758{
    759	struct page *head = compound_head(page); /* rmap on tail pages */
    760	struct mem_cgroup *memcg;
    761	pg_data_t *pgdat = page_pgdat(page);
    762	struct lruvec *lruvec;
    763
    764	rcu_read_lock();
    765	memcg = page_memcg(head);
    766	/* Untracked pages have no memcg, no lruvec. Update only the node */
    767	if (!memcg) {
    768		rcu_read_unlock();
    769		__mod_node_page_state(pgdat, idx, val);
    770		return;
    771	}
    772
    773	lruvec = mem_cgroup_lruvec(memcg, pgdat);
    774	__mod_lruvec_state(lruvec, idx, val);
    775	rcu_read_unlock();
    776}
    777EXPORT_SYMBOL(__mod_lruvec_page_state);
    778
    779void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
    780{
    781	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
    782	struct mem_cgroup *memcg;
    783	struct lruvec *lruvec;
    784
    785	rcu_read_lock();
    786	memcg = mem_cgroup_from_obj(p);
    787
    788	/*
    789	 * Untracked pages have no memcg, no lruvec. Update only the
    790	 * node. If we reparent the slab objects to the root memcg,
    791	 * when we free the slab object, we need to update the per-memcg
    792	 * vmstats to keep it correct for the root memcg.
    793	 */
    794	if (!memcg) {
    795		__mod_node_page_state(pgdat, idx, val);
    796	} else {
    797		lruvec = mem_cgroup_lruvec(memcg, pgdat);
    798		__mod_lruvec_state(lruvec, idx, val);
    799	}
    800	rcu_read_unlock();
    801}
    802
    803/**
    804 * __count_memcg_events - account VM events in a cgroup
    805 * @memcg: the memory cgroup
    806 * @idx: the event item
    807 * @count: the number of events that occurred
    808 */
    809void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
    810			  unsigned long count)
    811{
    812	if (mem_cgroup_disabled())
    813		return;
    814
    815	memcg_stats_lock();
    816	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
    817	memcg_rstat_updated(memcg, count);
    818	memcg_stats_unlock();
    819}
    820
    821static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
    822{
    823	return READ_ONCE(memcg->vmstats.events[event]);
    824}
    825
    826static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
    827{
    828	long x = 0;
    829	int cpu;
    830
    831	for_each_possible_cpu(cpu)
    832		x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
    833	return x;
    834}
    835
    836static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
    837					 int nr_pages)
    838{
    839	/* pagein of a big page is an event. So, ignore page size */
    840	if (nr_pages > 0)
    841		__count_memcg_events(memcg, PGPGIN, 1);
    842	else {
    843		__count_memcg_events(memcg, PGPGOUT, 1);
    844		nr_pages = -nr_pages; /* for event */
    845	}
    846
    847	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
    848}
    849
    850static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
    851				       enum mem_cgroup_events_target target)
    852{
    853	unsigned long val, next;
    854
    855	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
    856	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
    857	/* from time_after() in jiffies.h */
    858	if ((long)(next - val) < 0) {
    859		switch (target) {
    860		case MEM_CGROUP_TARGET_THRESH:
    861			next = val + THRESHOLDS_EVENTS_TARGET;
    862			break;
    863		case MEM_CGROUP_TARGET_SOFTLIMIT:
    864			next = val + SOFTLIMIT_EVENTS_TARGET;
    865			break;
    866		default:
    867			break;
    868		}
    869		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
    870		return true;
    871	}
    872	return false;
    873}
    874
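/*
 * Worked example (assumed values) for the wrap-safe comparison above:
 * suppose nr_page_events has nearly wrapped, so val == ULONG_MAX - 2 and
 * the stored target is next == 5.  Then next - val == 8 in unsigned
 * arithmetic, (long)8 > 0, and the target is treated as not yet reached.
 * Once val itself wraps to 7, next - val == (unsigned long)-2, which is
 * negative when cast to long, so the target counts as passed -- the same
 * trick time_after() uses for jiffies.
 */
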
    875/*
    876 * Check events in order.
    877 *
    878 */
    879static void memcg_check_events(struct mem_cgroup *memcg, int nid)
    880{
    881	if (IS_ENABLED(CONFIG_PREEMPT_RT))
    882		return;
    883
    884	/* threshold event is triggered in finer grain than soft limit */
    885	if (unlikely(mem_cgroup_event_ratelimit(memcg,
    886						MEM_CGROUP_TARGET_THRESH))) {
    887		bool do_softlimit;
    888
    889		do_softlimit = mem_cgroup_event_ratelimit(memcg,
    890						MEM_CGROUP_TARGET_SOFTLIMIT);
    891		mem_cgroup_threshold(memcg);
    892		if (unlikely(do_softlimit))
    893			mem_cgroup_update_tree(memcg, nid);
    894	}
    895}
    896
    897struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
    898{
    899	/*
    900	 * mm_update_next_owner() may clear mm->owner to NULL
    901	 * if it races with swapoff, page migration, etc.
    902	 * So this can be called with p == NULL.
    903	 */
    904	if (unlikely(!p))
    905		return NULL;
    906
    907	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
    908}
    909EXPORT_SYMBOL(mem_cgroup_from_task);
    910
    911static __always_inline struct mem_cgroup *active_memcg(void)
    912{
    913	if (!in_task())
    914		return this_cpu_read(int_active_memcg);
    915	else
    916		return current->active_memcg;
    917}
    918
    919/**
    920 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
    921 * @mm: mm from which memcg should be extracted. It can be NULL.
    922 *
     923 * Obtain a reference on mm->memcg and return it if successful. If mm
    924 * is NULL, then the memcg is chosen as follows:
    925 * 1) The active memcg, if set.
    926 * 2) current->mm->memcg, if available
    927 * 3) root memcg
    928 * If mem_cgroup is disabled, NULL is returned.
    929 */
    930struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
    931{
    932	struct mem_cgroup *memcg;
    933
    934	if (mem_cgroup_disabled())
    935		return NULL;
    936
    937	/*
    938	 * Page cache insertions can happen without an
    939	 * actual mm context, e.g. during disk probing
    940	 * on boot, loopback IO, acct() writes etc.
    941	 *
    942	 * No need to css_get on root memcg as the reference
    943	 * counting is disabled on the root level in the
    944	 * cgroup core. See CSS_NO_REF.
    945	 */
    946	if (unlikely(!mm)) {
    947		memcg = active_memcg();
    948		if (unlikely(memcg)) {
    949			/* remote memcg must hold a ref */
    950			css_get(&memcg->css);
    951			return memcg;
    952		}
    953		mm = current->mm;
    954		if (unlikely(!mm))
    955			return root_mem_cgroup;
    956	}
    957
    958	rcu_read_lock();
    959	do {
    960		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
    961		if (unlikely(!memcg))
    962			memcg = root_mem_cgroup;
    963	} while (!css_tryget(&memcg->css));
    964	rcu_read_unlock();
    965	return memcg;
    966}
    967EXPORT_SYMBOL(get_mem_cgroup_from_mm);
    968
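/*
 * Illustrative sketch (not compiled): the usual get/put pairing for the
 * helper above.  css_put() is a no-op for the root memcg (CSS_NO_REF),
 * so unconditionally dropping the reference is fine.
 */
#if 0
static void with_mm_memcg(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);

	if (!memcg)		/* only when mem_cgroup_disabled() */
		return;
	/* ... charge or inspect something on behalf of @mm ... */
	css_put(&memcg->css);
}
#endif
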
    969static __always_inline bool memcg_kmem_bypass(void)
    970{
    971	/* Allow remote memcg charging from any context. */
    972	if (unlikely(active_memcg()))
    973		return false;
    974
    975	/* Memcg to charge can't be determined. */
    976	if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
    977		return true;
    978
    979	return false;
    980}
    981
    982/**
    983 * mem_cgroup_iter - iterate over memory cgroup hierarchy
    984 * @root: hierarchy root
    985 * @prev: previously returned memcg, NULL on first invocation
    986 * @reclaim: cookie for shared reclaim walks, NULL for full walks
    987 *
    988 * Returns references to children of the hierarchy below @root, or
    989 * @root itself, or %NULL after a full round-trip.
    990 *
    991 * Caller must pass the return value in @prev on subsequent
    992 * invocations for reference counting, or use mem_cgroup_iter_break()
    993 * to cancel a hierarchy walk before the round-trip is complete.
    994 *
    995 * Reclaimers can specify a node in @reclaim to divide up the memcgs
    996 * in the hierarchy among all concurrent reclaimers operating on the
    997 * same node.
    998 */
    999struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
   1000				   struct mem_cgroup *prev,
   1001				   struct mem_cgroup_reclaim_cookie *reclaim)
   1002{
   1003	struct mem_cgroup_reclaim_iter *iter;
   1004	struct cgroup_subsys_state *css = NULL;
   1005	struct mem_cgroup *memcg = NULL;
   1006	struct mem_cgroup *pos = NULL;
   1007
   1008	if (mem_cgroup_disabled())
   1009		return NULL;
   1010
   1011	if (!root)
   1012		root = root_mem_cgroup;
   1013
   1014	rcu_read_lock();
   1015
   1016	if (reclaim) {
   1017		struct mem_cgroup_per_node *mz;
   1018
   1019		mz = root->nodeinfo[reclaim->pgdat->node_id];
   1020		iter = &mz->iter;
   1021
   1022		/*
   1023		 * On start, join the current reclaim iteration cycle.
   1024		 * Exit when a concurrent walker completes it.
   1025		 */
   1026		if (!prev)
   1027			reclaim->generation = iter->generation;
   1028		else if (reclaim->generation != iter->generation)
   1029			goto out_unlock;
   1030
   1031		while (1) {
   1032			pos = READ_ONCE(iter->position);
   1033			if (!pos || css_tryget(&pos->css))
   1034				break;
   1035			/*
   1036			 * css reference reached zero, so iter->position will
   1037			 * be cleared by ->css_released. However, we should not
   1038			 * rely on this happening soon, because ->css_released
   1039			 * is called from a work queue, and by busy-waiting we
   1040			 * might block it. So we clear iter->position right
   1041			 * away.
   1042			 */
   1043			(void)cmpxchg(&iter->position, pos, NULL);
   1044		}
   1045	} else if (prev) {
   1046		pos = prev;
   1047	}
   1048
   1049	if (pos)
   1050		css = &pos->css;
   1051
   1052	for (;;) {
   1053		css = css_next_descendant_pre(css, &root->css);
   1054		if (!css) {
   1055			/*
   1056			 * Reclaimers share the hierarchy walk, and a
   1057			 * new one might jump in right at the end of
   1058			 * the hierarchy - make sure they see at least
   1059			 * one group and restart from the beginning.
   1060			 */
   1061			if (!prev)
   1062				continue;
   1063			break;
   1064		}
   1065
   1066		/*
   1067		 * Verify the css and acquire a reference.  The root
   1068		 * is provided by the caller, so we know it's alive
   1069		 * and kicking, and don't take an extra reference.
   1070		 */
   1071		if (css == &root->css || css_tryget(css)) {
   1072			memcg = mem_cgroup_from_css(css);
   1073			break;
   1074		}
   1075	}
   1076
   1077	if (reclaim) {
   1078		/*
   1079		 * The position could have already been updated by a competing
   1080		 * thread, so check that the value hasn't changed since we read
   1081		 * it to avoid reclaiming from the same cgroup twice.
   1082		 */
   1083		(void)cmpxchg(&iter->position, pos, memcg);
   1084
   1085		if (pos)
   1086			css_put(&pos->css);
   1087
   1088		if (!memcg)
   1089			iter->generation++;
   1090	}
   1091
   1092out_unlock:
   1093	rcu_read_unlock();
   1094	if (prev && prev != root)
   1095		css_put(&prev->css);
   1096
   1097	return memcg;
   1098}
   1099
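/*
 * Illustrative sketch (not compiled): a shared reclaim walk driven by a
 * per-node cookie, in the same style as mem_cgroup_soft_reclaim() below.
 */
#if 0
static void walk_node_for_reclaim(struct mem_cgroup *root, pg_data_t *pgdat)
{
	struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat };
	struct mem_cgroup *memcg = NULL;

	while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))) {
		/* reclaim a little from @memcg; concurrent walkers share
		 * the per-node iterator position, so the subtree is divided
		 * among them rather than scanned by each in full. */
	}
	/* A NULL return means the round-trip completed; no iter_break here. */
}
#endif
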
   1100/**
   1101 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
   1102 * @root: hierarchy root
   1103 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
   1104 */
   1105void mem_cgroup_iter_break(struct mem_cgroup *root,
   1106			   struct mem_cgroup *prev)
   1107{
   1108	if (!root)
   1109		root = root_mem_cgroup;
   1110	if (prev && prev != root)
   1111		css_put(&prev->css);
   1112}
   1113
   1114static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
   1115					struct mem_cgroup *dead_memcg)
   1116{
   1117	struct mem_cgroup_reclaim_iter *iter;
   1118	struct mem_cgroup_per_node *mz;
   1119	int nid;
   1120
   1121	for_each_node(nid) {
   1122		mz = from->nodeinfo[nid];
   1123		iter = &mz->iter;
   1124		cmpxchg(&iter->position, dead_memcg, NULL);
   1125	}
   1126}
   1127
   1128static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
   1129{
   1130	struct mem_cgroup *memcg = dead_memcg;
   1131	struct mem_cgroup *last;
   1132
   1133	do {
   1134		__invalidate_reclaim_iterators(memcg, dead_memcg);
   1135		last = memcg;
   1136	} while ((memcg = parent_mem_cgroup(memcg)));
   1137
   1138	/*
    1139	 * When cgroup1 non-hierarchy mode is used,
   1140	 * parent_mem_cgroup() does not walk all the way up to the
   1141	 * cgroup root (root_mem_cgroup). So we have to handle
   1142	 * dead_memcg from cgroup root separately.
   1143	 */
   1144	if (last != root_mem_cgroup)
   1145		__invalidate_reclaim_iterators(root_mem_cgroup,
   1146						dead_memcg);
   1147}
   1148
   1149/**
   1150 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
   1151 * @memcg: hierarchy root
   1152 * @fn: function to call for each task
   1153 * @arg: argument passed to @fn
   1154 *
   1155 * This function iterates over tasks attached to @memcg or to any of its
   1156 * descendants and calls @fn for each task. If @fn returns a non-zero
   1157 * value, the function breaks the iteration loop and returns the value.
   1158 * Otherwise, it will iterate over all tasks and return 0.
   1159 *
   1160 * This function must not be called for the root memory cgroup.
   1161 */
   1162int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
   1163			  int (*fn)(struct task_struct *, void *), void *arg)
   1164{
   1165	struct mem_cgroup *iter;
   1166	int ret = 0;
   1167
   1168	BUG_ON(memcg == root_mem_cgroup);
   1169
   1170	for_each_mem_cgroup_tree(iter, memcg) {
   1171		struct css_task_iter it;
   1172		struct task_struct *task;
   1173
   1174		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
   1175		while (!ret && (task = css_task_iter_next(&it)))
   1176			ret = fn(task, arg);
   1177		css_task_iter_end(&it);
   1178		if (ret) {
   1179			mem_cgroup_iter_break(memcg, iter);
   1180			break;
   1181		}
   1182	}
   1183	return ret;
   1184}
   1185
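/*
 * Illustrative sketch (not compiled): a minimal callback for the task
 * iterator above; returning 0 keeps the walk going over every task.
 * As documented, this must not be used on the root memcg.
 */
#if 0
static int count_one_task(struct task_struct *task, void *arg)
{
	(*(unsigned long *)arg)++;
	return 0;
}

static unsigned long count_hierarchy_tasks(struct mem_cgroup *memcg)
{
	unsigned long nr = 0;

	mem_cgroup_scan_tasks(memcg, count_one_task, &nr);
	return nr;
}
#endif
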
   1186#ifdef CONFIG_DEBUG_VM
   1187void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
   1188{
   1189	struct mem_cgroup *memcg;
   1190
   1191	if (mem_cgroup_disabled())
   1192		return;
   1193
   1194	memcg = folio_memcg(folio);
   1195
   1196	if (!memcg)
   1197		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
   1198	else
   1199		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
   1200}
   1201#endif
   1202
   1203/**
   1204 * folio_lruvec_lock - Lock the lruvec for a folio.
   1205 * @folio: Pointer to the folio.
   1206 *
   1207 * These functions are safe to use under any of the following conditions:
   1208 * - folio locked
   1209 * - folio_test_lru false
   1210 * - folio_memcg_lock()
   1211 * - folio frozen (refcount of 0)
   1212 *
   1213 * Return: The lruvec this folio is on with its lock held.
   1214 */
   1215struct lruvec *folio_lruvec_lock(struct folio *folio)
   1216{
   1217	struct lruvec *lruvec = folio_lruvec(folio);
   1218
   1219	spin_lock(&lruvec->lru_lock);
   1220	lruvec_memcg_debug(lruvec, folio);
   1221
   1222	return lruvec;
   1223}
   1224
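/*
 * Illustrative sketch (not compiled): pairing the helper above with its
 * unlock.  The caller must satisfy one of the conditions listed in the
 * comment (e.g. hold the folio lock) for the lruvec to stay stable.
 */
#if 0
static void touch_lru_state(struct folio *folio)
{
	struct lruvec *lruvec = folio_lruvec_lock(folio);

	/* ... move the folio between LRU lists, update counters, ... */
	spin_unlock(&lruvec->lru_lock);
}
#endif
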
   1225/**
   1226 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
   1227 * @folio: Pointer to the folio.
   1228 *
   1229 * These functions are safe to use under any of the following conditions:
   1230 * - folio locked
   1231 * - folio_test_lru false
   1232 * - folio_memcg_lock()
   1233 * - folio frozen (refcount of 0)
   1234 *
   1235 * Return: The lruvec this folio is on with its lock held and interrupts
   1236 * disabled.
   1237 */
   1238struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
   1239{
   1240	struct lruvec *lruvec = folio_lruvec(folio);
   1241
   1242	spin_lock_irq(&lruvec->lru_lock);
   1243	lruvec_memcg_debug(lruvec, folio);
   1244
   1245	return lruvec;
   1246}
   1247
   1248/**
   1249 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
   1250 * @folio: Pointer to the folio.
   1251 * @flags: Pointer to irqsave flags.
   1252 *
   1253 * These functions are safe to use under any of the following conditions:
   1254 * - folio locked
   1255 * - folio_test_lru false
   1256 * - folio_memcg_lock()
   1257 * - folio frozen (refcount of 0)
   1258 *
   1259 * Return: The lruvec this folio is on with its lock held and interrupts
   1260 * disabled.
   1261 */
   1262struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
   1263		unsigned long *flags)
   1264{
   1265	struct lruvec *lruvec = folio_lruvec(folio);
   1266
   1267	spin_lock_irqsave(&lruvec->lru_lock, *flags);
   1268	lruvec_memcg_debug(lruvec, folio);
   1269
   1270	return lruvec;
   1271}
   1272
   1273/**
   1274 * mem_cgroup_update_lru_size - account for adding or removing an lru page
   1275 * @lruvec: mem_cgroup per zone lru vector
   1276 * @lru: index of lru list the page is sitting on
   1277 * @zid: zone id of the accounted pages
   1278 * @nr_pages: positive when adding or negative when removing
   1279 *
   1280 * This function must be called under lru_lock, just before a page is added
   1281 * to or just after a page is removed from an lru list.
   1282 */
   1283void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
   1284				int zid, int nr_pages)
   1285{
   1286	struct mem_cgroup_per_node *mz;
   1287	unsigned long *lru_size;
   1288	long size;
   1289
   1290	if (mem_cgroup_disabled())
   1291		return;
   1292
   1293	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
   1294	lru_size = &mz->lru_zone_size[zid][lru];
   1295
   1296	if (nr_pages < 0)
   1297		*lru_size += nr_pages;
   1298
   1299	size = *lru_size;
   1300	if (WARN_ONCE(size < 0,
   1301		"%s(%p, %d, %d): lru_size %ld\n",
   1302		__func__, lruvec, lru, nr_pages, size)) {
   1303		VM_BUG_ON(1);
   1304		*lru_size = 0;
   1305	}
   1306
   1307	if (nr_pages > 0)
   1308		*lru_size += nr_pages;
   1309}
   1310
   1311/**
   1312 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
   1313 * @memcg: the memory cgroup
   1314 *
    1315 * Returns the maximum amount of memory @memcg can be charged with, in
   1316 * pages.
   1317 */
   1318static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
   1319{
   1320	unsigned long margin = 0;
   1321	unsigned long count;
   1322	unsigned long limit;
   1323
   1324	count = page_counter_read(&memcg->memory);
   1325	limit = READ_ONCE(memcg->memory.max);
   1326	if (count < limit)
   1327		margin = limit - count;
   1328
   1329	if (do_memsw_account()) {
   1330		count = page_counter_read(&memcg->memsw);
   1331		limit = READ_ONCE(memcg->memsw.max);
   1332		if (count < limit)
   1333			margin = min(margin, limit - count);
   1334		else
   1335			margin = 0;
   1336	}
   1337
   1338	return margin;
   1339}
   1340
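/*
 * Worked example (assumed values) for the margin calculation above: with
 * memory usage at 900 pages and memory.max at 1000, the margin is 100
 * pages; if legacy memsw accounting is active with memsw usage 920 and
 * memsw.max 950, the margin shrinks to min(100, 30) == 30 pages.
 */
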
   1341/*
    1342 * A routine for checking whether "mem" is under move_account() or not.
    1343 *
    1344 * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of
    1345 * moving cgroups. This is for waiting at high memory pressure
   1346 * caused by "move".
   1347 */
   1348static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
   1349{
   1350	struct mem_cgroup *from;
   1351	struct mem_cgroup *to;
   1352	bool ret = false;
   1353	/*
    1354	 * Unlike the task_move routines, we access mc.to and mc.from without
    1355	 * mutual exclusion by cgroup_mutex. Here, we take the spinlock instead.
   1356	 */
   1357	spin_lock(&mc.lock);
   1358	from = mc.from;
   1359	to = mc.to;
   1360	if (!from)
   1361		goto unlock;
   1362
   1363	ret = mem_cgroup_is_descendant(from, memcg) ||
   1364		mem_cgroup_is_descendant(to, memcg);
   1365unlock:
   1366	spin_unlock(&mc.lock);
   1367	return ret;
   1368}
   1369
   1370static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
   1371{
   1372	if (mc.moving_task && current != mc.moving_task) {
   1373		if (mem_cgroup_under_move(memcg)) {
   1374			DEFINE_WAIT(wait);
   1375			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
   1376			/* moving charge context might have finished. */
   1377			if (mc.moving_task)
   1378				schedule();
   1379			finish_wait(&mc.waitq, &wait);
   1380			return true;
   1381		}
   1382	}
   1383	return false;
   1384}
   1385
   1386struct memory_stat {
   1387	const char *name;
   1388	unsigned int idx;
   1389};
   1390
   1391static const struct memory_stat memory_stats[] = {
   1392	{ "anon",			NR_ANON_MAPPED			},
   1393	{ "file",			NR_FILE_PAGES			},
   1394	{ "kernel",			MEMCG_KMEM			},
   1395	{ "kernel_stack",		NR_KERNEL_STACK_KB		},
   1396	{ "pagetables",			NR_PAGETABLE			},
   1397	{ "percpu",			MEMCG_PERCPU_B			},
   1398	{ "sock",			MEMCG_SOCK			},
   1399	{ "vmalloc",			MEMCG_VMALLOC			},
   1400	{ "shmem",			NR_SHMEM			},
   1401#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
   1402	{ "zswap",			MEMCG_ZSWAP_B			},
   1403	{ "zswapped",			MEMCG_ZSWAPPED			},
   1404#endif
   1405	{ "file_mapped",		NR_FILE_MAPPED			},
   1406	{ "file_dirty",			NR_FILE_DIRTY			},
   1407	{ "file_writeback",		NR_WRITEBACK			},
   1408#ifdef CONFIG_SWAP
   1409	{ "swapcached",			NR_SWAPCACHE			},
   1410#endif
   1411#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1412	{ "anon_thp",			NR_ANON_THPS			},
   1413	{ "file_thp",			NR_FILE_THPS			},
   1414	{ "shmem_thp",			NR_SHMEM_THPS			},
   1415#endif
   1416	{ "inactive_anon",		NR_INACTIVE_ANON		},
   1417	{ "active_anon",		NR_ACTIVE_ANON			},
   1418	{ "inactive_file",		NR_INACTIVE_FILE		},
   1419	{ "active_file",		NR_ACTIVE_FILE			},
   1420	{ "unevictable",		NR_UNEVICTABLE			},
   1421	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B		},
   1422	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B		},
   1423
   1424	/* The memory events */
   1425	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON		},
   1426	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE		},
   1427	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON	},
   1428	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE	},
   1429	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON		},
   1430	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE		},
   1431	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM		},
   1432};
   1433
   1434/* Translate stat items to the correct unit for memory.stat output */
   1435static int memcg_page_state_unit(int item)
   1436{
   1437	switch (item) {
   1438	case MEMCG_PERCPU_B:
   1439	case MEMCG_ZSWAP_B:
   1440	case NR_SLAB_RECLAIMABLE_B:
   1441	case NR_SLAB_UNRECLAIMABLE_B:
   1442	case WORKINGSET_REFAULT_ANON:
   1443	case WORKINGSET_REFAULT_FILE:
   1444	case WORKINGSET_ACTIVATE_ANON:
   1445	case WORKINGSET_ACTIVATE_FILE:
   1446	case WORKINGSET_RESTORE_ANON:
   1447	case WORKINGSET_RESTORE_FILE:
   1448	case WORKINGSET_NODERECLAIM:
   1449		return 1;
   1450	case NR_KERNEL_STACK_KB:
   1451		return SZ_1K;
   1452	default:
   1453		return PAGE_SIZE;
   1454	}
   1455}
   1456
   1457static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
   1458						    int item)
   1459{
   1460	return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
   1461}
   1462
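/*
 * Worked example (assumed values) for the unit translation above: a memcg
 * with three 16kB kernel stacks keeps NR_KERNEL_STACK_KB at 48, so
 * memcg_page_state_output() reports 48 * SZ_1K == 49152 bytes, while
 * page-based counters are scaled by PAGE_SIZE and the byte-based slab
 * counters are passed through unchanged.
 */
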
   1463static char *memory_stat_format(struct mem_cgroup *memcg)
   1464{
   1465	struct seq_buf s;
   1466	int i;
   1467
   1468	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
   1469	if (!s.buffer)
   1470		return NULL;
   1471
   1472	/*
   1473	 * Provide statistics on the state of the memory subsystem as
   1474	 * well as cumulative event counters that show past behavior.
   1475	 *
   1476	 * This list is ordered following a combination of these gradients:
   1477	 * 1) generic big picture -> specifics and details
   1478	 * 2) reflecting userspace activity -> reflecting kernel heuristics
   1479	 *
   1480	 * Current memory state:
   1481	 */
   1482	mem_cgroup_flush_stats();
   1483
   1484	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
   1485		u64 size;
   1486
   1487		size = memcg_page_state_output(memcg, memory_stats[i].idx);
   1488		seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
   1489
   1490		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
   1491			size += memcg_page_state_output(memcg,
   1492							NR_SLAB_RECLAIMABLE_B);
   1493			seq_buf_printf(&s, "slab %llu\n", size);
   1494		}
   1495	}
   1496
   1497	/* Accumulated memory events */
   1498
   1499	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
   1500		       memcg_events(memcg, PGFAULT));
   1501	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
   1502		       memcg_events(memcg, PGMAJFAULT));
   1503	seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
   1504		       memcg_events(memcg, PGREFILL));
   1505	seq_buf_printf(&s, "pgscan %lu\n",
   1506		       memcg_events(memcg, PGSCAN_KSWAPD) +
   1507		       memcg_events(memcg, PGSCAN_DIRECT));
   1508	seq_buf_printf(&s, "pgsteal %lu\n",
   1509		       memcg_events(memcg, PGSTEAL_KSWAPD) +
   1510		       memcg_events(memcg, PGSTEAL_DIRECT));
   1511	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
   1512		       memcg_events(memcg, PGACTIVATE));
   1513	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
   1514		       memcg_events(memcg, PGDEACTIVATE));
   1515	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
   1516		       memcg_events(memcg, PGLAZYFREE));
   1517	seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
   1518		       memcg_events(memcg, PGLAZYFREED));
   1519
   1520#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
   1521	seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
   1522		       memcg_events(memcg, ZSWPIN));
   1523	seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
   1524		       memcg_events(memcg, ZSWPOUT));
   1525#endif
   1526
   1527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1528	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
   1529		       memcg_events(memcg, THP_FAULT_ALLOC));
   1530	seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
   1531		       memcg_events(memcg, THP_COLLAPSE_ALLOC));
   1532#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
   1533
   1534	/* The above should easily fit into one page */
   1535	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
   1536
   1537	return s.buffer;
   1538}
   1539
   1540#define K(x) ((x) << (PAGE_SHIFT-10))
   1541/**
   1542 * mem_cgroup_print_oom_context: Print OOM information relevant to
   1543 * memory controller.
   1544 * @memcg: The memory cgroup that went over limit
   1545 * @p: Task that is going to be killed
   1546 *
   1547 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
   1548 * enabled
   1549 */
   1550void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
   1551{
   1552	rcu_read_lock();
   1553
   1554	if (memcg) {
   1555		pr_cont(",oom_memcg=");
   1556		pr_cont_cgroup_path(memcg->css.cgroup);
   1557	} else
   1558		pr_cont(",global_oom");
   1559	if (p) {
   1560		pr_cont(",task_memcg=");
   1561		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
   1562	}
   1563	rcu_read_unlock();
   1564}
   1565
   1566/**
   1567 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
   1568 * memory controller.
   1569 * @memcg: The memory cgroup that went over limit
   1570 */
   1571void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
   1572{
   1573	char *buf;
   1574
   1575	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
   1576		K((u64)page_counter_read(&memcg->memory)),
   1577		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
   1578	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
   1579		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
   1580			K((u64)page_counter_read(&memcg->swap)),
   1581			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
   1582	else {
   1583		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
   1584			K((u64)page_counter_read(&memcg->memsw)),
   1585			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
   1586		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
   1587			K((u64)page_counter_read(&memcg->kmem)),
   1588			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
   1589	}
   1590
   1591	pr_info("Memory cgroup stats for ");
   1592	pr_cont_cgroup_path(memcg->css.cgroup);
   1593	pr_cont(":");
   1594	buf = memory_stat_format(memcg);
   1595	if (!buf)
   1596		return;
   1597	pr_info("%s", buf);
   1598	kfree(buf);
   1599}
   1600
   1601/*
   1602 * Return the memory (and swap, if configured) limit for a memcg.
   1603 */
   1604unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
   1605{
   1606	unsigned long max = READ_ONCE(memcg->memory.max);
   1607
   1608	if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
   1609		if (mem_cgroup_swappiness(memcg))
   1610			max += min(READ_ONCE(memcg->swap.max),
   1611				   (unsigned long)total_swap_pages);
   1612	} else { /* v1 */
   1613		if (mem_cgroup_swappiness(memcg)) {
   1614			/* Calculate swap excess capacity from memsw limit */
   1615			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
   1616
   1617			max += min(swap, (unsigned long)total_swap_pages);
   1618		}
   1619	}
   1620	return max;
   1621}
   1622
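/*
 * Worked example (assumed values, non-zero swappiness) for
 * mem_cgroup_get_max() above: on cgroup v2 with memory.max == 1000 pages,
 * swap.max == 500 and only 300 swap pages in the system, the result is
 * 1000 + min(500, 300) == 1300.  On v1 with memsw.max == 1200, the swap
 * headroom is 1200 - 1000 == 200, so the result is
 * 1000 + min(200, 300) == 1200.
 */
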
   1623unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
   1624{
   1625	return page_counter_read(&memcg->memory);
   1626}
   1627
   1628static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
   1629				     int order)
   1630{
   1631	struct oom_control oc = {
   1632		.zonelist = NULL,
   1633		.nodemask = NULL,
   1634		.memcg = memcg,
   1635		.gfp_mask = gfp_mask,
   1636		.order = order,
   1637	};
   1638	bool ret = true;
   1639
   1640	if (mutex_lock_killable(&oom_lock))
   1641		return true;
   1642
   1643	if (mem_cgroup_margin(memcg) >= (1 << order))
   1644		goto unlock;
   1645
   1646	/*
   1647	 * A few threads which were not waiting at mutex_lock_killable() can
   1648	 * fail to bail out. Therefore, check again after holding oom_lock.
   1649	 */
   1650	ret = task_is_dying() || out_of_memory(&oc);
   1651
   1652unlock:
   1653	mutex_unlock(&oom_lock);
   1654	return ret;
   1655}
   1656
   1657static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
   1658				   pg_data_t *pgdat,
   1659				   gfp_t gfp_mask,
   1660				   unsigned long *total_scanned)
   1661{
   1662	struct mem_cgroup *victim = NULL;
   1663	int total = 0;
   1664	int loop = 0;
   1665	unsigned long excess;
   1666	unsigned long nr_scanned;
   1667	struct mem_cgroup_reclaim_cookie reclaim = {
   1668		.pgdat = pgdat,
   1669	};
   1670
   1671	excess = soft_limit_excess(root_memcg);
   1672
   1673	while (1) {
   1674		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
   1675		if (!victim) {
   1676			loop++;
   1677			if (loop >= 2) {
   1678				/*
   1679				 * If we have not been able to reclaim
    1680				 * anything, it might be because there are
    1681				 * no reclaimable pages under this hierarchy.
   1682				 */
   1683				if (!total)
   1684					break;
   1685				/*
   1686				 * We want to do more targeted reclaim.
    1687				 * excess >> 2 is not so excessive that we
    1688				 * reclaim too much, nor so small that we keep
    1689				 * coming back to reclaim from this cgroup
   1690				 */
   1691				if (total >= (excess >> 2) ||
   1692					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
   1693					break;
   1694			}
   1695			continue;
   1696		}
   1697		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
   1698					pgdat, &nr_scanned);
   1699		*total_scanned += nr_scanned;
   1700		if (!soft_limit_excess(root_memcg))
   1701			break;
   1702	}
   1703	mem_cgroup_iter_break(root_memcg, victim);
   1704	return total;
   1705}
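
/*
 * Worked example (editor's addition, illustrative only): with a soft-limit
 * excess of 400 pages, the loop above keeps picking victims via
 * mem_cgroup_iter() until the excess is gone; after two full passes over
 * the hierarchy it also stops once at least excess >> 2 == 100 pages have
 * been reclaimed, when nothing could be reclaimed at all, or when
 * MEM_CGROUP_MAX_RECLAIM_LOOPS passes have been made.
 */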
   1706
   1707#ifdef CONFIG_LOCKDEP
   1708static struct lockdep_map memcg_oom_lock_dep_map = {
   1709	.name = "memcg_oom_lock",
   1710};
   1711#endif
   1712
   1713static DEFINE_SPINLOCK(memcg_oom_lock);
   1714
   1715/*
    1716 * Check whether the OOM killer is already running in our hierarchy.
    1717 * If someone else is already running it, return false.
   1718 */
   1719static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
   1720{
   1721	struct mem_cgroup *iter, *failed = NULL;
   1722
   1723	spin_lock(&memcg_oom_lock);
   1724
   1725	for_each_mem_cgroup_tree(iter, memcg) {
   1726		if (iter->oom_lock) {
   1727			/*
    1728			 * this subtree of our hierarchy is already locked,
    1729			 * so we cannot grant the lock.
   1730			 */
   1731			failed = iter;
   1732			mem_cgroup_iter_break(memcg, iter);
   1733			break;
   1734		} else
   1735			iter->oom_lock = true;
   1736	}
   1737
   1738	if (failed) {
   1739		/*
    1740		 * OK, we failed to lock the whole subtree, so we have
    1741		 * to clean up what we already set up, up to the failing subtree
   1742		 */
   1743		for_each_mem_cgroup_tree(iter, memcg) {
   1744			if (iter == failed) {
   1745				mem_cgroup_iter_break(memcg, iter);
   1746				break;
   1747			}
   1748			iter->oom_lock = false;
   1749		}
   1750	} else
   1751		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
   1752
   1753	spin_unlock(&memcg_oom_lock);
   1754
   1755	return !failed;
   1756}
   1757
   1758static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
   1759{
   1760	struct mem_cgroup *iter;
   1761
   1762	spin_lock(&memcg_oom_lock);
   1763	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
   1764	for_each_mem_cgroup_tree(iter, memcg)
   1765		iter->oom_lock = false;
   1766	spin_unlock(&memcg_oom_lock);
   1767}
   1768
   1769static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
   1770{
   1771	struct mem_cgroup *iter;
   1772
   1773	spin_lock(&memcg_oom_lock);
   1774	for_each_mem_cgroup_tree(iter, memcg)
   1775		iter->under_oom++;
   1776	spin_unlock(&memcg_oom_lock);
   1777}
   1778
   1779static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
   1780{
   1781	struct mem_cgroup *iter;
   1782
   1783	/*
   1784	 * Be careful about under_oom underflows because a child memcg
   1785	 * could have been added after mem_cgroup_mark_under_oom.
   1786	 */
   1787	spin_lock(&memcg_oom_lock);
   1788	for_each_mem_cgroup_tree(iter, memcg)
   1789		if (iter->under_oom > 0)
   1790			iter->under_oom--;
   1791	spin_unlock(&memcg_oom_lock);
   1792}
   1793
   1794static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
   1795
   1796struct oom_wait_info {
   1797	struct mem_cgroup *memcg;
   1798	wait_queue_entry_t	wait;
   1799};
   1800
   1801static int memcg_oom_wake_function(wait_queue_entry_t *wait,
   1802	unsigned mode, int sync, void *arg)
   1803{
   1804	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
   1805	struct mem_cgroup *oom_wait_memcg;
   1806	struct oom_wait_info *oom_wait_info;
   1807
   1808	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
   1809	oom_wait_memcg = oom_wait_info->memcg;
   1810
   1811	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
   1812	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
   1813		return 0;
   1814	return autoremove_wake_function(wait, mode, sync, arg);
   1815}
   1816
   1817static void memcg_oom_recover(struct mem_cgroup *memcg)
   1818{
   1819	/*
   1820	 * For the following lockless ->under_oom test, the only required
   1821	 * guarantee is that it must see the state asserted by an OOM when
   1822	 * this function is called as a result of userland actions
   1823	 * triggered by the notification of the OOM.  This is trivially
   1824	 * achieved by invoking mem_cgroup_mark_under_oom() before
   1825	 * triggering notification.
   1826	 */
   1827	if (memcg && memcg->under_oom)
   1828		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
   1829}
   1830
   1831/*
    1832 * Returns true if one or more processes were successfully killed, though in
    1833 * some corner cases it can return true even without killing any process.
   1834 */
   1835static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
   1836{
   1837	bool locked, ret;
   1838
   1839	if (order > PAGE_ALLOC_COSTLY_ORDER)
   1840		return false;
   1841
   1842	memcg_memory_event(memcg, MEMCG_OOM);
   1843
   1844	/*
   1845	 * We are in the middle of the charge context here, so we
   1846	 * don't want to block when potentially sitting on a callstack
   1847	 * that holds all kinds of filesystem and mm locks.
   1848	 *
   1849	 * cgroup1 allows disabling the OOM killer and waiting for outside
   1850	 * handling until the charge can succeed; remember the context and put
   1851	 * the task to sleep at the end of the page fault when all locks are
   1852	 * released.
   1853	 *
    1854	 * On the other hand, the in-kernel OOM killer allows for async victim
    1855	 * memory reclaim (oom_reaper), which means that we are not solely
    1856	 * relying on the oom victim to make forward progress, so we can
    1857	 * invoke the oom killer here.
   1858	 *
   1859	 * Please note that mem_cgroup_out_of_memory might fail to find a
   1860	 * victim and then we have to bail out from the charge path.
   1861	 */
   1862	if (memcg->oom_kill_disable) {
   1863		if (current->in_user_fault) {
   1864			css_get(&memcg->css);
   1865			current->memcg_in_oom = memcg;
   1866			current->memcg_oom_gfp_mask = mask;
   1867			current->memcg_oom_order = order;
   1868		}
   1869		return false;
   1870	}
   1871
   1872	mem_cgroup_mark_under_oom(memcg);
   1873
   1874	locked = mem_cgroup_oom_trylock(memcg);
   1875
   1876	if (locked)
   1877		mem_cgroup_oom_notify(memcg);
   1878
   1879	mem_cgroup_unmark_under_oom(memcg);
   1880	ret = mem_cgroup_out_of_memory(memcg, mask, order);
   1881
   1882	if (locked)
   1883		mem_cgroup_oom_unlock(memcg);
   1884
   1885	return ret;
   1886}
   1887
   1888/**
   1889 * mem_cgroup_oom_synchronize - complete memcg OOM handling
   1890 * @handle: actually kill/wait or just clean up the OOM state
   1891 *
   1892 * This has to be called at the end of a page fault if the memcg OOM
   1893 * handler was enabled.
   1894 *
   1895 * Memcg supports userspace OOM handling where failed allocations must
   1896 * sleep on a waitqueue until the userspace task resolves the
   1897 * situation.  Sleeping directly in the charge context with all kinds
   1898 * of locks held is not a good idea, instead we remember an OOM state
   1899 * in the task and mem_cgroup_oom_synchronize() has to be called at
   1900 * the end of the page fault to complete the OOM handling.
   1901 *
   1902 * Returns %true if an ongoing memcg OOM situation was detected and
   1903 * completed, %false otherwise.
   1904 */
   1905bool mem_cgroup_oom_synchronize(bool handle)
   1906{
   1907	struct mem_cgroup *memcg = current->memcg_in_oom;
   1908	struct oom_wait_info owait;
   1909	bool locked;
   1910
   1911	/* OOM is global, do not handle */
   1912	if (!memcg)
   1913		return false;
   1914
   1915	if (!handle)
   1916		goto cleanup;
   1917
   1918	owait.memcg = memcg;
   1919	owait.wait.flags = 0;
   1920	owait.wait.func = memcg_oom_wake_function;
   1921	owait.wait.private = current;
   1922	INIT_LIST_HEAD(&owait.wait.entry);
   1923
   1924	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
   1925	mem_cgroup_mark_under_oom(memcg);
   1926
   1927	locked = mem_cgroup_oom_trylock(memcg);
   1928
   1929	if (locked)
   1930		mem_cgroup_oom_notify(memcg);
   1931
   1932	if (locked && !memcg->oom_kill_disable) {
   1933		mem_cgroup_unmark_under_oom(memcg);
   1934		finish_wait(&memcg_oom_waitq, &owait.wait);
   1935		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
   1936					 current->memcg_oom_order);
   1937	} else {
   1938		schedule();
   1939		mem_cgroup_unmark_under_oom(memcg);
   1940		finish_wait(&memcg_oom_waitq, &owait.wait);
   1941	}
   1942
   1943	if (locked) {
   1944		mem_cgroup_oom_unlock(memcg);
   1945		/*
   1946		 * There is no guarantee that an OOM-lock contender
   1947		 * sees the wakeups triggered by the OOM kill
   1948		 * uncharges.  Wake any sleepers explicitly.
   1949		 */
   1950		memcg_oom_recover(memcg);
   1951	}
   1952cleanup:
   1953	current->memcg_in_oom = NULL;
   1954	css_put(&memcg->css);
   1955	return true;
   1956}
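
/*
 * Illustrative flow (editor's note; the call sites referenced here are an
 * assumption based on the usual page-fault integration): with cgroup1's
 * oom_kill_disable set, a failing charge in a user fault only records
 *
 *	current->memcg_in_oom       = memcg;
 *	current->memcg_oom_gfp_mask = mask;
 *	current->memcg_oom_order    = order;
 *
 * in mem_cgroup_oom() above, and the page-fault OOM path is expected to
 * finish the handling once all locks are dropped via
 *
 *	mem_cgroup_oom_synchronize(true);
 *
 * which either invokes the OOM killer or sleeps on memcg_oom_waitq until
 * memcg_oom_recover() wakes it up.
 */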
   1957
   1958/**
   1959 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
   1960 * @victim: task to be killed by the OOM killer
   1961 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
   1962 *
   1963 * Returns a pointer to a memory cgroup, which has to be cleaned up
   1964 * by killing all belonging OOM-killable tasks.
   1965 *
   1966 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
   1967 */
   1968struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
   1969					    struct mem_cgroup *oom_domain)
   1970{
   1971	struct mem_cgroup *oom_group = NULL;
   1972	struct mem_cgroup *memcg;
   1973
   1974	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
   1975		return NULL;
   1976
   1977	if (!oom_domain)
   1978		oom_domain = root_mem_cgroup;
   1979
   1980	rcu_read_lock();
   1981
   1982	memcg = mem_cgroup_from_task(victim);
   1983	if (memcg == root_mem_cgroup)
   1984		goto out;
   1985
   1986	/*
   1987	 * If the victim task has been asynchronously moved to a different
   1988	 * memory cgroup, we might end up killing tasks outside oom_domain.
    1989	 * In this case it's better to ignore memory.oom.group.
   1990	 */
   1991	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
   1992		goto out;
   1993
   1994	/*
   1995	 * Traverse the memory cgroup hierarchy from the victim task's
   1996	 * cgroup up to the OOMing cgroup (or root) to find the
   1997	 * highest-level memory cgroup with oom.group set.
   1998	 */
   1999	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
   2000		if (memcg->oom_group)
   2001			oom_group = memcg;
   2002
   2003		if (memcg == oom_domain)
   2004			break;
   2005	}
   2006
   2007	if (oom_group)
   2008		css_get(&oom_group->css);
   2009out:
   2010	rcu_read_unlock();
   2011
   2012	return oom_group;
   2013}
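
/*
 * Usage sketch (editor's addition; hypothetical caller shown only for
 * illustration): the OOM killer is expected to pair this with
 * mem_cgroup_put(), roughly
 *
 *	struct mem_cgroup *group;
 *
 *	group = mem_cgroup_get_oom_group(victim, oom_domain);
 *	if (group) {
 *		mem_cgroup_print_oom_group(group);
 *		(kill every OOM-killable task belonging to group)
 *		mem_cgroup_put(group);
 *	}
 */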
   2014
   2015void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
   2016{
   2017	pr_info("Tasks in ");
   2018	pr_cont_cgroup_path(memcg->css.cgroup);
   2019	pr_cont(" are going to be killed due to memory.oom.group set\n");
   2020}
   2021
   2022/**
   2023 * folio_memcg_lock - Bind a folio to its memcg.
   2024 * @folio: The folio.
   2025 *
   2026 * This function prevents unlocked LRU folios from being moved to
   2027 * another cgroup.
   2028 *
   2029 * It ensures lifetime of the bound memcg.  The caller is responsible
   2030 * for the lifetime of the folio.
   2031 */
   2032void folio_memcg_lock(struct folio *folio)
   2033{
   2034	struct mem_cgroup *memcg;
   2035	unsigned long flags;
   2036
   2037	/*
   2038	 * The RCU lock is held throughout the transaction.  The fast
   2039	 * path can get away without acquiring the memcg->move_lock
   2040	 * because page moving starts with an RCU grace period.
    2041	 */
   2042	rcu_read_lock();
   2043
   2044	if (mem_cgroup_disabled())
   2045		return;
   2046again:
   2047	memcg = folio_memcg(folio);
   2048	if (unlikely(!memcg))
   2049		return;
   2050
   2051#ifdef CONFIG_PROVE_LOCKING
   2052	local_irq_save(flags);
   2053	might_lock(&memcg->move_lock);
   2054	local_irq_restore(flags);
   2055#endif
   2056
   2057	if (atomic_read(&memcg->moving_account) <= 0)
   2058		return;
   2059
   2060	spin_lock_irqsave(&memcg->move_lock, flags);
   2061	if (memcg != folio_memcg(folio)) {
   2062		spin_unlock_irqrestore(&memcg->move_lock, flags);
   2063		goto again;
   2064	}
   2065
   2066	/*
   2067	 * When charge migration first begins, we can have multiple
   2068	 * critical sections holding the fast-path RCU lock and one
    2069	 * holding the slowpath move_lock. Track the task that holds the
    2070	 * move_lock for unlock_page_memcg().
   2071	 */
   2072	memcg->move_lock_task = current;
   2073	memcg->move_lock_flags = flags;
   2074}
   2075
   2076void lock_page_memcg(struct page *page)
   2077{
   2078	folio_memcg_lock(page_folio(page));
   2079}
   2080
   2081static void __folio_memcg_unlock(struct mem_cgroup *memcg)
   2082{
   2083	if (memcg && memcg->move_lock_task == current) {
   2084		unsigned long flags = memcg->move_lock_flags;
   2085
   2086		memcg->move_lock_task = NULL;
   2087		memcg->move_lock_flags = 0;
   2088
   2089		spin_unlock_irqrestore(&memcg->move_lock, flags);
   2090	}
   2091
   2092	rcu_read_unlock();
   2093}
   2094
   2095/**
   2096 * folio_memcg_unlock - Release the binding between a folio and its memcg.
   2097 * @folio: The folio.
   2098 *
   2099 * This releases the binding created by folio_memcg_lock().  This does
   2100 * not change the accounting of this folio to its memcg, but it does
   2101 * permit others to change it.
   2102 */
   2103void folio_memcg_unlock(struct folio *folio)
   2104{
   2105	__folio_memcg_unlock(folio_memcg(folio));
   2106}
   2107
   2108void unlock_page_memcg(struct page *page)
   2109{
   2110	folio_memcg_unlock(page_folio(page));
   2111}
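
/*
 * Usage sketch (editor's addition, illustrative only): a typical caller
 * brackets a folio state or statistics update so the folio cannot be moved
 * to another cgroup in between:
 *
 *	folio_memcg_lock(folio);
 *	(update folio flags / memcg statistics)
 *	folio_memcg_unlock(folio);
 *
 * The fast path only takes the RCU read lock; memcg->move_lock is taken
 * solely while charge moving is in progress.
 */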
   2112
   2113struct memcg_stock_pcp {
   2114	local_lock_t stock_lock;
    2115	struct mem_cgroup *cached; /* this is never the root cgroup */
   2116	unsigned int nr_pages;
   2117
   2118#ifdef CONFIG_MEMCG_KMEM
   2119	struct obj_cgroup *cached_objcg;
   2120	struct pglist_data *cached_pgdat;
   2121	unsigned int nr_bytes;
   2122	int nr_slab_reclaimable_b;
   2123	int nr_slab_unreclaimable_b;
   2124#endif
   2125
   2126	struct work_struct work;
   2127	unsigned long flags;
   2128#define FLUSHING_CACHED_CHARGE	0
   2129};
   2130static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
   2131	.stock_lock = INIT_LOCAL_LOCK(stock_lock),
   2132};
   2133static DEFINE_MUTEX(percpu_charge_mutex);
   2134
   2135#ifdef CONFIG_MEMCG_KMEM
   2136static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
   2137static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
   2138				     struct mem_cgroup *root_memcg);
   2139static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
   2140
   2141#else
   2142static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
   2143{
   2144	return NULL;
   2145}
   2146static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
   2147				     struct mem_cgroup *root_memcg)
   2148{
   2149	return false;
   2150}
   2151static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
   2152{
   2153}
   2154#endif
   2155
   2156/**
   2157 * consume_stock: Try to consume stocked charge on this cpu.
   2158 * @memcg: memcg to consume from.
   2159 * @nr_pages: how many pages to charge.
   2160 *
   2161 * The charges will only happen if @memcg matches the current cpu's memcg
   2162 * stock, and at least @nr_pages are available in that stock.  Failure to
   2163 * service an allocation will refill the stock.
   2164 *
   2165 * returns true if successful, false otherwise.
   2166 */
   2167static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
   2168{
   2169	struct memcg_stock_pcp *stock;
   2170	unsigned long flags;
   2171	bool ret = false;
   2172
   2173	if (nr_pages > MEMCG_CHARGE_BATCH)
   2174		return ret;
   2175
   2176	local_lock_irqsave(&memcg_stock.stock_lock, flags);
   2177
   2178	stock = this_cpu_ptr(&memcg_stock);
   2179	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
   2180		stock->nr_pages -= nr_pages;
   2181		ret = true;
   2182	}
   2183
   2184	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
   2185
   2186	return ret;
   2187}
   2188
   2189/*
    2190 * Return the cached stock to the page counters and reset cached information.
   2191 */
   2192static void drain_stock(struct memcg_stock_pcp *stock)
   2193{
   2194	struct mem_cgroup *old = stock->cached;
   2195
   2196	if (!old)
   2197		return;
   2198
   2199	if (stock->nr_pages) {
   2200		page_counter_uncharge(&old->memory, stock->nr_pages);
   2201		if (do_memsw_account())
   2202			page_counter_uncharge(&old->memsw, stock->nr_pages);
   2203		stock->nr_pages = 0;
   2204	}
   2205
   2206	css_put(&old->css);
   2207	stock->cached = NULL;
   2208}
   2209
   2210static void drain_local_stock(struct work_struct *dummy)
   2211{
   2212	struct memcg_stock_pcp *stock;
   2213	struct obj_cgroup *old = NULL;
   2214	unsigned long flags;
   2215
   2216	/*
   2217	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
    2218	 * drain_stock races is that we always operate on the local CPU stock
    2219	 * here with IRQs disabled.
   2220	 */
   2221	local_lock_irqsave(&memcg_stock.stock_lock, flags);
   2222
   2223	stock = this_cpu_ptr(&memcg_stock);
   2224	old = drain_obj_stock(stock);
   2225	drain_stock(stock);
   2226	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
   2227
   2228	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
   2229	if (old)
   2230		obj_cgroup_put(old);
   2231}
   2232
   2233/*
    2234 * Cache charges (nr_pages) in the local per-CPU area.
    2235 * They will be consumed by consume_stock() later.
   2236 */
   2237static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
   2238{
   2239	struct memcg_stock_pcp *stock;
   2240
   2241	stock = this_cpu_ptr(&memcg_stock);
   2242	if (stock->cached != memcg) { /* reset if necessary */
   2243		drain_stock(stock);
   2244		css_get(&memcg->css);
   2245		stock->cached = memcg;
   2246	}
   2247	stock->nr_pages += nr_pages;
   2248
   2249	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
   2250		drain_stock(stock);
   2251}
   2252
   2253static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
   2254{
   2255	unsigned long flags;
   2256
   2257	local_lock_irqsave(&memcg_stock.stock_lock, flags);
   2258	__refill_stock(memcg, nr_pages);
   2259	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
   2260}
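
/*
 * Editor's note (conceptual sketch, not from the original source): the
 * per-CPU stock is a batching fast path. A charge first tries
 * consume_stock(); on a miss the page counters are charged with up to
 * MEMCG_CHARGE_BATCH pages and the surplus is handed back, conceptually:
 *
 *	if (!consume_stock(memcg, nr_pages)) {
 *		(charge the page counters by batch >= nr_pages)
 *		refill_stock(memcg, batch - nr_pages);
 *	}
 *
 * See try_charge_memcg() below for the real, fully error-handled version.
 */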
   2261
   2262/*
    2263 * Drain all per-CPU charge caches for the given root_memcg and the subtree
   2264 * of the hierarchy under it.
   2265 */
   2266static void drain_all_stock(struct mem_cgroup *root_memcg)
   2267{
   2268	int cpu, curcpu;
   2269
    2270	/* If someone's already draining, avoid adding more workers. */
   2271	if (!mutex_trylock(&percpu_charge_mutex))
   2272		return;
   2273	/*
    2274	 * Notify other cpus that a system-wide "drain" is running.
   2275	 * We do not care about races with the cpu hotplug because cpu down
   2276	 * as well as workers from this path always operate on the local
   2277	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
   2278	 */
   2279	migrate_disable();
   2280	curcpu = smp_processor_id();
   2281	for_each_online_cpu(cpu) {
   2282		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
   2283		struct mem_cgroup *memcg;
   2284		bool flush = false;
   2285
   2286		rcu_read_lock();
   2287		memcg = stock->cached;
   2288		if (memcg && stock->nr_pages &&
   2289		    mem_cgroup_is_descendant(memcg, root_memcg))
   2290			flush = true;
   2291		else if (obj_stock_flush_required(stock, root_memcg))
   2292			flush = true;
   2293		rcu_read_unlock();
   2294
   2295		if (flush &&
   2296		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
   2297			if (cpu == curcpu)
   2298				drain_local_stock(&stock->work);
   2299			else
   2300				schedule_work_on(cpu, &stock->work);
   2301		}
   2302	}
   2303	migrate_enable();
   2304	mutex_unlock(&percpu_charge_mutex);
   2305}
   2306
   2307static int memcg_hotplug_cpu_dead(unsigned int cpu)
   2308{
   2309	struct memcg_stock_pcp *stock;
   2310
   2311	stock = &per_cpu(memcg_stock, cpu);
   2312	drain_stock(stock);
   2313
   2314	return 0;
   2315}
   2316
   2317static unsigned long reclaim_high(struct mem_cgroup *memcg,
   2318				  unsigned int nr_pages,
   2319				  gfp_t gfp_mask)
   2320{
   2321	unsigned long nr_reclaimed = 0;
   2322
   2323	do {
   2324		unsigned long pflags;
   2325
   2326		if (page_counter_read(&memcg->memory) <=
   2327		    READ_ONCE(memcg->memory.high))
   2328			continue;
   2329
   2330		memcg_memory_event(memcg, MEMCG_HIGH);
   2331
   2332		psi_memstall_enter(&pflags);
   2333		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
   2334							     gfp_mask, true);
   2335		psi_memstall_leave(&pflags);
   2336	} while ((memcg = parent_mem_cgroup(memcg)) &&
   2337		 !mem_cgroup_is_root(memcg));
   2338
   2339	return nr_reclaimed;
   2340}
   2341
   2342static void high_work_func(struct work_struct *work)
   2343{
   2344	struct mem_cgroup *memcg;
   2345
   2346	memcg = container_of(work, struct mem_cgroup, high_work);
   2347	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
   2348}
   2349
   2350/*
   2351 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
   2352 * enough to still cause a significant slowdown in most cases, while still
   2353 * allowing diagnostics and tracing to proceed without becoming stuck.
   2354 */
   2355#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
   2356
   2357/*
    2358 * When calculating the delay, we use these on either side of the exponentiation
    2359 * to maintain precision and scale to a reasonable number of jiffies (see the
    2360 * table below).
   2361 *
   2362 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
   2363 *   overage ratio to a delay.
   2364 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
   2365 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
   2366 *   to produce a reasonable delay curve.
   2367 *
   2368 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
   2369 * reasonable delay curve compared to precision-adjusted overage, not
   2370 * penalising heavily at first, but still making sure that growth beyond the
    2371 * limit penalises misbehaving cgroups by slowing them down exponentially. For
   2372 * example, with a high of 100 megabytes:
   2373 *
   2374 *  +-------+------------------------+
   2375 *  | usage | time to allocate in ms |
   2376 *  +-------+------------------------+
   2377 *  | 100M  |                      0 |
   2378 *  | 101M  |                      6 |
   2379 *  | 102M  |                     25 |
   2380 *  | 103M  |                     57 |
   2381 *  | 104M  |                    102 |
   2382 *  | 105M  |                    159 |
   2383 *  | 106M  |                    230 |
   2384 *  | 107M  |                    313 |
   2385 *  | 108M  |                    409 |
   2386 *  | 109M  |                    518 |
   2387 *  | 110M  |                    639 |
   2388 *  | 111M  |                    774 |
   2389 *  | 112M  |                    921 |
   2390 *  | 113M  |                   1081 |
   2391 *  | 114M  |                   1254 |
   2392 *  | 115M  |                   1439 |
   2393 *  | 116M  |                   1638 |
   2394 *  | 117M  |                   1849 |
   2395 *  | 118M  |                   2000 |
   2396 *  | 119M  |                   2000 |
   2397 *  | 120M  |                   2000 |
   2398 *  +-------+------------------------+
   2399 */
    2400#define MEMCG_DELAY_PRECISION_SHIFT 20
    2401#define MEMCG_DELAY_SCALING_SHIFT 14
   2402
   2403static u64 calculate_overage(unsigned long usage, unsigned long high)
   2404{
   2405	u64 overage;
   2406
   2407	if (usage <= high)
   2408		return 0;
   2409
   2410	/*
   2411	 * Prevent division by 0 in overage calculation by acting as if
   2412	 * it was a threshold of 1 page
   2413	 */
   2414	high = max(high, 1UL);
   2415
   2416	overage = usage - high;
   2417	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
   2418	return div64_u64(overage, high);
   2419}
   2420
   2421static u64 mem_find_max_overage(struct mem_cgroup *memcg)
   2422{
   2423	u64 overage, max_overage = 0;
   2424
   2425	do {
   2426		overage = calculate_overage(page_counter_read(&memcg->memory),
   2427					    READ_ONCE(memcg->memory.high));
   2428		max_overage = max(overage, max_overage);
   2429	} while ((memcg = parent_mem_cgroup(memcg)) &&
   2430		 !mem_cgroup_is_root(memcg));
   2431
   2432	return max_overage;
   2433}
   2434
   2435static u64 swap_find_max_overage(struct mem_cgroup *memcg)
   2436{
   2437	u64 overage, max_overage = 0;
   2438
   2439	do {
   2440		overage = calculate_overage(page_counter_read(&memcg->swap),
   2441					    READ_ONCE(memcg->swap.high));
   2442		if (overage)
   2443			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
   2444		max_overage = max(overage, max_overage);
   2445	} while ((memcg = parent_mem_cgroup(memcg)) &&
   2446		 !mem_cgroup_is_root(memcg));
   2447
   2448	return max_overage;
   2449}
   2450
   2451/*
   2452 * Get the number of jiffies that we should penalise a mischievous cgroup which
   2453 * is exceeding its memory.high by checking both it and its ancestors.
   2454 */
   2455static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
   2456					  unsigned int nr_pages,
   2457					  u64 max_overage)
   2458{
   2459	unsigned long penalty_jiffies;
   2460
   2461	if (!max_overage)
   2462		return 0;
   2463
   2464	/*
   2465	 * We use overage compared to memory.high to calculate the number of
   2466	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
   2467	 * fairly lenient on small overages, and increasingly harsh when the
   2468	 * memcg in question makes it clear that it has no intention of stopping
   2469	 * its crazy behaviour, so we exponentially increase the delay based on
   2470	 * overage amount.
   2471	 */
   2472	penalty_jiffies = max_overage * max_overage * HZ;
   2473	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
   2474	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
   2475
   2476	/*
   2477	 * Factor in the task's own contribution to the overage, such that four
   2478	 * N-sized allocations are throttled approximately the same as one
   2479	 * 4N-sized allocation.
   2480	 *
   2481	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
    2482	 * larger the current charge batch is than that.
   2483	 */
   2484	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
   2485}
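
/*
 * Worked example (editor's addition): with memory.high = 100M (25600
 * 4K pages) and usage = 104M, calculate_overage() returns
 * (1024 << 20) / 25600 = 41943. Squaring, multiplying by HZ and shifting
 * right by 20 + 14 yields about 102 jiffies at HZ=1000, matching the
 * 102 ms row in the table above, before the result is scaled by
 * nr_pages / MEMCG_CHARGE_BATCH for the current charge batch.
 */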
   2486
   2487/*
   2488 * Scheduled by try_charge() to be executed from the userland return path
   2489 * and reclaims memory over the high limit.
   2490 */
   2491void mem_cgroup_handle_over_high(void)
   2492{
   2493	unsigned long penalty_jiffies;
   2494	unsigned long pflags;
   2495	unsigned long nr_reclaimed;
   2496	unsigned int nr_pages = current->memcg_nr_pages_over_high;
   2497	int nr_retries = MAX_RECLAIM_RETRIES;
   2498	struct mem_cgroup *memcg;
   2499	bool in_retry = false;
   2500
   2501	if (likely(!nr_pages))
   2502		return;
   2503
   2504	memcg = get_mem_cgroup_from_mm(current->mm);
   2505	current->memcg_nr_pages_over_high = 0;
   2506
   2507retry_reclaim:
   2508	/*
   2509	 * The allocating task should reclaim at least the batch size, but for
   2510	 * subsequent retries we only want to do what's necessary to prevent oom
   2511	 * or breaching resource isolation.
   2512	 *
   2513	 * This is distinct from memory.max or page allocator behaviour because
   2514	 * memory.high is currently batched, whereas memory.max and the page
   2515	 * allocator run every time an allocation is made.
   2516	 */
   2517	nr_reclaimed = reclaim_high(memcg,
   2518				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
   2519				    GFP_KERNEL);
   2520
   2521	/*
   2522	 * memory.high is breached and reclaim is unable to keep up. Throttle
   2523	 * allocators proactively to slow down excessive growth.
   2524	 */
   2525	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
   2526					       mem_find_max_overage(memcg));
   2527
   2528	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
   2529						swap_find_max_overage(memcg));
   2530
   2531	/*
   2532	 * Clamp the max delay per usermode return so as to still keep the
   2533	 * application moving forwards and also permit diagnostics, albeit
   2534	 * extremely slowly.
   2535	 */
   2536	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
   2537
   2538	/*
   2539	 * Don't sleep if the amount of jiffies this memcg owes us is so low
   2540	 * that it's not even worth doing, in an attempt to be nice to those who
   2541	 * go only a small amount over their memory.high value and maybe haven't
   2542	 * been aggressively reclaimed enough yet.
   2543	 */
   2544	if (penalty_jiffies <= HZ / 100)
   2545		goto out;
   2546
   2547	/*
   2548	 * If reclaim is making forward progress but we're still over
   2549	 * memory.high, we want to encourage that rather than doing allocator
   2550	 * throttling.
   2551	 */
   2552	if (nr_reclaimed || nr_retries--) {
   2553		in_retry = true;
   2554		goto retry_reclaim;
   2555	}
   2556
   2557	/*
   2558	 * If we exit early, we're guaranteed to die (since
   2559	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
   2560	 * need to account for any ill-begotten jiffies to pay them off later.
   2561	 */
   2562	psi_memstall_enter(&pflags);
   2563	schedule_timeout_killable(penalty_jiffies);
   2564	psi_memstall_leave(&pflags);
   2565
   2566out:
   2567	css_put(&memcg->css);
   2568}
   2569
   2570static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
   2571			unsigned int nr_pages)
   2572{
   2573	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
   2574	int nr_retries = MAX_RECLAIM_RETRIES;
   2575	struct mem_cgroup *mem_over_limit;
   2576	struct page_counter *counter;
   2577	unsigned long nr_reclaimed;
   2578	bool passed_oom = false;
   2579	bool may_swap = true;
   2580	bool drained = false;
   2581	unsigned long pflags;
   2582
   2583retry:
   2584	if (consume_stock(memcg, nr_pages))
   2585		return 0;
   2586
   2587	if (!do_memsw_account() ||
   2588	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
   2589		if (page_counter_try_charge(&memcg->memory, batch, &counter))
   2590			goto done_restock;
   2591		if (do_memsw_account())
   2592			page_counter_uncharge(&memcg->memsw, batch);
   2593		mem_over_limit = mem_cgroup_from_counter(counter, memory);
   2594	} else {
   2595		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
   2596		may_swap = false;
   2597	}
   2598
   2599	if (batch > nr_pages) {
   2600		batch = nr_pages;
   2601		goto retry;
   2602	}
   2603
   2604	/*
   2605	 * Prevent unbounded recursion when reclaim operations need to
   2606	 * allocate memory. This might exceed the limits temporarily,
   2607	 * but we prefer facilitating memory reclaim and getting back
   2608	 * under the limit over triggering OOM kills in these cases.
   2609	 */
   2610	if (unlikely(current->flags & PF_MEMALLOC))
   2611		goto force;
   2612
   2613	if (unlikely(task_in_memcg_oom(current)))
   2614		goto nomem;
   2615
   2616	if (!gfpflags_allow_blocking(gfp_mask))
   2617		goto nomem;
   2618
   2619	memcg_memory_event(mem_over_limit, MEMCG_MAX);
   2620
   2621	psi_memstall_enter(&pflags);
   2622	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
   2623						    gfp_mask, may_swap);
   2624	psi_memstall_leave(&pflags);
   2625
   2626	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
   2627		goto retry;
   2628
   2629	if (!drained) {
   2630		drain_all_stock(mem_over_limit);
   2631		drained = true;
   2632		goto retry;
   2633	}
   2634
   2635	if (gfp_mask & __GFP_NORETRY)
   2636		goto nomem;
   2637	/*
   2638	 * Even though the limit is exceeded at this point, reclaim
   2639	 * may have been able to free some pages.  Retry the charge
   2640	 * before killing the task.
   2641	 *
   2642	 * Only for regular pages, though: huge pages are rather
   2643	 * unlikely to succeed so close to the limit, and we fall back
   2644	 * to regular pages anyway in case of failure.
   2645	 */
   2646	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
   2647		goto retry;
   2648	/*
    2649	 * During task move, charges can be double-counted. So it's
    2650	 * better to wait until the end of the task move if one is in progress.
   2651	 */
   2652	if (mem_cgroup_wait_acct_move(mem_over_limit))
   2653		goto retry;
   2654
   2655	if (nr_retries--)
   2656		goto retry;
   2657
   2658	if (gfp_mask & __GFP_RETRY_MAYFAIL)
   2659		goto nomem;
   2660
   2661	/* Avoid endless loop for tasks bypassed by the oom killer */
   2662	if (passed_oom && task_is_dying())
   2663		goto nomem;
   2664
   2665	/*
    2666	 * Keep retrying as long as the memcg oom killer is able to make
    2667	 * forward progress, or bypass the charge if the oom killer
    2668	 * couldn't make any progress.
   2669	 */
   2670	if (mem_cgroup_oom(mem_over_limit, gfp_mask,
   2671			   get_order(nr_pages * PAGE_SIZE))) {
   2672		passed_oom = true;
   2673		nr_retries = MAX_RECLAIM_RETRIES;
   2674		goto retry;
   2675	}
   2676nomem:
   2677	/*
   2678	 * Memcg doesn't have a dedicated reserve for atomic
   2679	 * allocations. But like the global atomic pool, we need to
   2680	 * put the burden of reclaim on regular allocation requests
   2681	 * and let these go through as privileged allocations.
   2682	 */
   2683	if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
   2684		return -ENOMEM;
   2685force:
   2686	/*
   2687	 * The allocation either can't fail or will lead to more memory
    2688	 * being freed very soon.  Allow memory usage to go over the limit
   2689	 * temporarily by force charging it.
   2690	 */
   2691	page_counter_charge(&memcg->memory, nr_pages);
   2692	if (do_memsw_account())
   2693		page_counter_charge(&memcg->memsw, nr_pages);
   2694
   2695	return 0;
   2696
   2697done_restock:
   2698	if (batch > nr_pages)
   2699		refill_stock(memcg, batch - nr_pages);
   2700
   2701	/*
   2702	 * If the hierarchy is above the normal consumption range, schedule
   2703	 * reclaim on returning to userland.  We can perform reclaim here
   2704	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
   2705	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
   2706	 * not recorded as it most likely matches current's and won't
   2707	 * change in the meantime.  As high limit is checked again before
   2708	 * reclaim, the cost of mismatch is negligible.
   2709	 */
   2710	do {
   2711		bool mem_high, swap_high;
   2712
   2713		mem_high = page_counter_read(&memcg->memory) >
   2714			READ_ONCE(memcg->memory.high);
   2715		swap_high = page_counter_read(&memcg->swap) >
   2716			READ_ONCE(memcg->swap.high);
   2717
   2718		/* Don't bother a random interrupted task */
   2719		if (!in_task()) {
   2720			if (mem_high) {
   2721				schedule_work(&memcg->high_work);
   2722				break;
   2723			}
   2724			continue;
   2725		}
   2726
   2727		if (mem_high || swap_high) {
   2728			/*
   2729			 * The allocating tasks in this cgroup will need to do
   2730			 * reclaim or be throttled to prevent further growth
   2731			 * of the memory or swap footprints.
   2732			 *
   2733			 * Target some best-effort fairness between the tasks,
   2734			 * and distribute reclaim work and delay penalties
   2735			 * based on how much each task is actually allocating.
   2736			 */
   2737			current->memcg_nr_pages_over_high += batch;
   2738			set_notify_resume(current);
   2739			break;
   2740		}
   2741	} while ((memcg = parent_mem_cgroup(memcg)));
   2742
   2743	if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
   2744	    !(current->flags & PF_MEMALLOC) &&
   2745	    gfpflags_allow_blocking(gfp_mask)) {
   2746		mem_cgroup_handle_over_high();
   2747	}
   2748	return 0;
   2749}
   2750
   2751static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
   2752			     unsigned int nr_pages)
   2753{
   2754	if (mem_cgroup_is_root(memcg))
   2755		return 0;
   2756
   2757	return try_charge_memcg(memcg, gfp_mask, nr_pages);
   2758}
   2759
   2760static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
   2761{
   2762	if (mem_cgroup_is_root(memcg))
   2763		return;
   2764
   2765	page_counter_uncharge(&memcg->memory, nr_pages);
   2766	if (do_memsw_account())
   2767		page_counter_uncharge(&memcg->memsw, nr_pages);
   2768}
   2769
   2770static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
   2771{
   2772	VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
   2773	/*
    2774	 * Any of the following ensures the page's memcg stability:
   2775	 *
   2776	 * - the page lock
   2777	 * - LRU isolation
   2778	 * - lock_page_memcg()
   2779	 * - exclusive reference
   2780	 */
   2781	folio->memcg_data = (unsigned long)memcg;
   2782}
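
/*
 * Illustrative charging sequence (editor's addition; the actual entry
 * points such as __mem_cgroup_charge() appear later in this file):
 *
 *	ret = try_charge(memcg, gfp, nr_pages);
 *	if (ret)
 *		return ret;
 *	css_get(&memcg->css);
 *	commit_charge(folio, memcg);
 *
 * with cancel_charge() undoing the page counters when a charged folio
 * cannot be committed after all.
 */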
   2783
   2784#ifdef CONFIG_MEMCG_KMEM
   2785/*
   2786 * The allocated objcg pointers array is not accounted directly.
    2787 * Moreover, it should not come from a DMA buffer and is not readily
   2788 * reclaimable. So those GFP bits should be masked off.
   2789 */
   2790#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
   2791
   2792/*
    2793 * mod_objcg_mlstate() may be called with irqs enabled, so the
    2794 * irq-safe mod_memcg_lruvec_state() should be used.
   2795 */
   2796static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
   2797				     struct pglist_data *pgdat,
   2798				     enum node_stat_item idx, int nr)
   2799{
   2800	struct mem_cgroup *memcg;
   2801	struct lruvec *lruvec;
   2802
   2803	rcu_read_lock();
   2804	memcg = obj_cgroup_memcg(objcg);
   2805	lruvec = mem_cgroup_lruvec(memcg, pgdat);
   2806	mod_memcg_lruvec_state(lruvec, idx, nr);
   2807	rcu_read_unlock();
   2808}
   2809
   2810int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
   2811				 gfp_t gfp, bool new_slab)
   2812{
   2813	unsigned int objects = objs_per_slab(s, slab);
   2814	unsigned long memcg_data;
   2815	void *vec;
   2816
   2817	gfp &= ~OBJCGS_CLEAR_MASK;
   2818	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
   2819			   slab_nid(slab));
   2820	if (!vec)
   2821		return -ENOMEM;
   2822
   2823	memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
   2824	if (new_slab) {
   2825		/*
   2826		 * If the slab is brand new and nobody can yet access its
   2827		 * memcg_data, no synchronization is required and memcg_data can
   2828		 * be simply assigned.
   2829		 */
   2830		slab->memcg_data = memcg_data;
   2831	} else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
   2832		/*
   2833		 * If the slab is already in use, somebody can allocate and
   2834		 * assign obj_cgroups in parallel. In this case the existing
   2835		 * objcg vector should be reused.
   2836		 */
   2837		kfree(vec);
   2838		return 0;
   2839	}
   2840
   2841	kmemleak_not_leak(vec);
   2842	return 0;
   2843}
   2844
   2845/*
   2846 * Returns a pointer to the memory cgroup to which the kernel object is charged.
   2847 *
   2848 * A passed kernel object can be a slab object or a generic kernel page, so
   2849 * different mechanisms for getting the memory cgroup pointer should be used.
   2850 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
    2851 * cannot know for sure how the kernel object is implemented.
   2852 * mem_cgroup_from_obj() can be safely used in such cases.
   2853 *
   2854 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
   2855 * cgroup_mutex, etc.
   2856 */
   2857struct mem_cgroup *mem_cgroup_from_obj(void *p)
   2858{
   2859	struct folio *folio;
   2860
   2861	if (mem_cgroup_disabled())
   2862		return NULL;
   2863
   2864	folio = virt_to_folio(p);
   2865
   2866	/*
   2867	 * Slab objects are accounted individually, not per-page.
   2868	 * Memcg membership data for each individual object is saved in
   2869	 * slab->memcg_data.
   2870	 */
   2871	if (folio_test_slab(folio)) {
   2872		struct obj_cgroup **objcgs;
   2873		struct slab *slab;
   2874		unsigned int off;
   2875
   2876		slab = folio_slab(folio);
   2877		objcgs = slab_objcgs(slab);
   2878		if (!objcgs)
   2879			return NULL;
   2880
   2881		off = obj_to_index(slab->slab_cache, slab, p);
   2882		if (objcgs[off])
   2883			return obj_cgroup_memcg(objcgs[off]);
   2884
   2885		return NULL;
   2886	}
   2887
   2888	/*
   2889	 * page_memcg_check() is used here, because in theory we can encounter
   2890	 * a folio where the slab flag has been cleared already, but
    2891	 * slab->memcg_data has not been freed yet.
   2892	 * page_memcg_check(page) will guarantee that a proper memory
   2893	 * cgroup pointer or NULL will be returned.
   2894	 */
   2895	return page_memcg_check(folio_page(folio, 0));
   2896}
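
/*
 * Usage sketch (editor's addition, illustrative caller): as the comment
 * above notes, the caller must pin the returned memcg, e.g.
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_obj(p);
 *	if (memcg)
 *		(inspect or account against memcg)
 *	rcu_read_unlock();
 */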
   2897
   2898static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
   2899{
   2900	struct obj_cgroup *objcg = NULL;
   2901
   2902	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
   2903		objcg = rcu_dereference(memcg->objcg);
   2904		if (objcg && obj_cgroup_tryget(objcg))
   2905			break;
   2906		objcg = NULL;
   2907	}
   2908	return objcg;
   2909}
   2910
   2911__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
   2912{
   2913	struct obj_cgroup *objcg = NULL;
   2914	struct mem_cgroup *memcg;
   2915
   2916	if (memcg_kmem_bypass())
   2917		return NULL;
   2918
   2919	rcu_read_lock();
   2920	if (unlikely(active_memcg()))
   2921		memcg = active_memcg();
   2922	else
   2923		memcg = mem_cgroup_from_task(current);
   2924	objcg = __get_obj_cgroup_from_memcg(memcg);
   2925	rcu_read_unlock();
   2926	return objcg;
   2927}
   2928
   2929struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
   2930{
   2931	struct obj_cgroup *objcg;
   2932
   2933	if (!memcg_kmem_enabled() || memcg_kmem_bypass())
   2934		return NULL;
   2935
   2936	if (PageMemcgKmem(page)) {
   2937		objcg = __folio_objcg(page_folio(page));
   2938		obj_cgroup_get(objcg);
   2939	} else {
   2940		struct mem_cgroup *memcg;
   2941
   2942		rcu_read_lock();
   2943		memcg = __folio_memcg(page_folio(page));
   2944		if (memcg)
   2945			objcg = __get_obj_cgroup_from_memcg(memcg);
   2946		else
   2947			objcg = NULL;
   2948		rcu_read_unlock();
   2949	}
   2950	return objcg;
   2951}
   2952
   2953static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
   2954{
   2955	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
   2956	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
   2957		if (nr_pages > 0)
   2958			page_counter_charge(&memcg->kmem, nr_pages);
   2959		else
   2960			page_counter_uncharge(&memcg->kmem, -nr_pages);
   2961	}
   2962}
   2963
   2964
   2965/*
    2966 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from an objcg
   2967 * @objcg: object cgroup to uncharge
   2968 * @nr_pages: number of pages to uncharge
   2969 */
   2970static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
   2971				      unsigned int nr_pages)
   2972{
   2973	struct mem_cgroup *memcg;
   2974
   2975	memcg = get_mem_cgroup_from_objcg(objcg);
   2976
   2977	memcg_account_kmem(memcg, -nr_pages);
   2978	refill_stock(memcg, nr_pages);
   2979
   2980	css_put(&memcg->css);
   2981}
   2982
   2983/*
    2984 * obj_cgroup_charge_pages: charge a number of kernel pages to an objcg
   2985 * @objcg: object cgroup to charge
   2986 * @gfp: reclaim mode
   2987 * @nr_pages: number of pages to charge
   2988 *
   2989 * Returns 0 on success, an error code on failure.
   2990 */
   2991static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
   2992				   unsigned int nr_pages)
   2993{
   2994	struct mem_cgroup *memcg;
   2995	int ret;
   2996
   2997	memcg = get_mem_cgroup_from_objcg(objcg);
   2998
   2999	ret = try_charge_memcg(memcg, gfp, nr_pages);
   3000	if (ret)
   3001		goto out;
   3002
   3003	memcg_account_kmem(memcg, nr_pages);
   3004out:
   3005	css_put(&memcg->css);
   3006
   3007	return ret;
   3008}
   3009
   3010/**
   3011 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
   3012 * @page: page to charge
   3013 * @gfp: reclaim mode
   3014 * @order: allocation order
   3015 *
   3016 * Returns 0 on success, an error code on failure.
   3017 */
   3018int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
   3019{
   3020	struct obj_cgroup *objcg;
   3021	int ret = 0;
   3022
   3023	objcg = get_obj_cgroup_from_current();
   3024	if (objcg) {
   3025		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
   3026		if (!ret) {
   3027			page->memcg_data = (unsigned long)objcg |
   3028				MEMCG_DATA_KMEM;
   3029			return 0;
   3030		}
   3031		obj_cgroup_put(objcg);
   3032	}
   3033	return ret;
   3034}
   3035
   3036/**
   3037 * __memcg_kmem_uncharge_page: uncharge a kmem page
   3038 * @page: page to uncharge
   3039 * @order: allocation order
   3040 */
   3041void __memcg_kmem_uncharge_page(struct page *page, int order)
   3042{
   3043	struct folio *folio = page_folio(page);
   3044	struct obj_cgroup *objcg;
   3045	unsigned int nr_pages = 1 << order;
   3046
   3047	if (!folio_memcg_kmem(folio))
   3048		return;
   3049
   3050	objcg = __folio_objcg(folio);
   3051	obj_cgroup_uncharge_pages(objcg, nr_pages);
   3052	folio->memcg_data = 0;
   3053	obj_cgroup_put(objcg);
   3054}
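
/*
 * Editor's note (assumption about the call sites, for illustration only):
 * these helpers back the page allocator's handling of __GFP_ACCOUNT, so a
 * plain
 *
 *	page = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 *
 * from an accounted context is charged here and uncharged again when the
 * page is freed.
 */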
   3055
   3056void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
   3057		     enum node_stat_item idx, int nr)
   3058{
   3059	struct memcg_stock_pcp *stock;
   3060	struct obj_cgroup *old = NULL;
   3061	unsigned long flags;
   3062	int *bytes;
   3063
   3064	local_lock_irqsave(&memcg_stock.stock_lock, flags);
   3065	stock = this_cpu_ptr(&memcg_stock);
   3066
   3067	/*
   3068	 * Save vmstat data in stock and skip vmstat array update unless
   3069	 * accumulating over a page of vmstat data or when pgdat or idx
   3070	 * changes.
   3071	 */
   3072	if (stock->cached_objcg != objcg) {
   3073		old = drain_obj_stock(stock);
   3074		obj_cgroup_get(objcg);
   3075		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
   3076				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
   3077		stock->cached_objcg = objcg;
   3078		stock->cached_pgdat = pgdat;
   3079	} else if (stock->cached_pgdat != pgdat) {
   3080		/* Flush the existing cached vmstat data */
   3081		struct pglist_data *oldpg = stock->cached_pgdat;
   3082
   3083		if (stock->nr_slab_reclaimable_b) {
   3084			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
   3085					  stock->nr_slab_reclaimable_b);
   3086			stock->nr_slab_reclaimable_b = 0;
   3087		}
   3088		if (stock->nr_slab_unreclaimable_b) {
   3089			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
   3090					  stock->nr_slab_unreclaimable_b);
   3091			stock->nr_slab_unreclaimable_b = 0;
   3092		}
   3093		stock->cached_pgdat = pgdat;
   3094	}
   3095
   3096	bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
   3097					       : &stock->nr_slab_unreclaimable_b;
   3098	/*
    3099	 * Even for large objects >= PAGE_SIZE, the vmstat data will still be
   3100	 * cached locally at least once before pushing it out.
   3101	 */
   3102	if (!*bytes) {
   3103		*bytes = nr;
   3104		nr = 0;
   3105	} else {
   3106		*bytes += nr;
   3107		if (abs(*bytes) > PAGE_SIZE) {
   3108			nr = *bytes;
   3109			*bytes = 0;
   3110		} else {
   3111			nr = 0;
   3112		}
   3113	}
   3114	if (nr)
   3115		mod_objcg_mlstate(objcg, pgdat, idx, nr);
   3116
   3117	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
   3118	if (old)
   3119		obj_cgroup_put(old);
   3120}
   3121
   3122static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
   3123{
   3124	struct memcg_stock_pcp *stock;
   3125	unsigned long flags;
   3126	bool ret = false;
   3127
   3128	local_lock_irqsave(&memcg_stock.stock_lock, flags);
   3129
   3130	stock = this_cpu_ptr(&memcg_stock);
   3131	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
   3132		stock->nr_bytes -= nr_bytes;
   3133		ret = true;
   3134	}
   3135
   3136	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
   3137
   3138	return ret;
   3139}
   3140
   3141static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
   3142{
   3143	struct obj_cgroup *old = stock->cached_objcg;
   3144
   3145	if (!old)
   3146		return NULL;
   3147
   3148	if (stock->nr_bytes) {
   3149		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
   3150		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
   3151
   3152		if (nr_pages) {
   3153			struct mem_cgroup *memcg;
   3154
   3155			memcg = get_mem_cgroup_from_objcg(old);
   3156
   3157			memcg_account_kmem(memcg, -nr_pages);
   3158			__refill_stock(memcg, nr_pages);
   3159
   3160			css_put(&memcg->css);
   3161		}
   3162
   3163		/*
   3164		 * The leftover is flushed to the centralized per-memcg value.
   3165		 * On the next attempt to refill obj stock it will be moved
    3166		 * to a per-cpu stock (probably on another CPU), see
   3167		 * refill_obj_stock().
   3168		 *
   3169		 * How often it's flushed is a trade-off between the memory
   3170		 * limit enforcement accuracy and potential CPU contention,
   3171		 * so it might be changed in the future.
   3172		 */
   3173		atomic_add(nr_bytes, &old->nr_charged_bytes);
   3174		stock->nr_bytes = 0;
   3175	}
   3176
   3177	/*
   3178	 * Flush the vmstat data in current stock
   3179	 */
   3180	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
   3181		if (stock->nr_slab_reclaimable_b) {
   3182			mod_objcg_mlstate(old, stock->cached_pgdat,
   3183					  NR_SLAB_RECLAIMABLE_B,
   3184					  stock->nr_slab_reclaimable_b);
   3185			stock->nr_slab_reclaimable_b = 0;
   3186		}
   3187		if (stock->nr_slab_unreclaimable_b) {
   3188			mod_objcg_mlstate(old, stock->cached_pgdat,
   3189					  NR_SLAB_UNRECLAIMABLE_B,
   3190					  stock->nr_slab_unreclaimable_b);
   3191			stock->nr_slab_unreclaimable_b = 0;
   3192		}
   3193		stock->cached_pgdat = NULL;
   3194	}
   3195
   3196	stock->cached_objcg = NULL;
   3197	/*
    3198	 * The `old' objcg needs to be released by the caller via
   3199	 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
   3200	 */
   3201	return old;
   3202}
   3203
   3204static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
   3205				     struct mem_cgroup *root_memcg)
   3206{
   3207	struct mem_cgroup *memcg;
   3208
   3209	if (stock->cached_objcg) {
   3210		memcg = obj_cgroup_memcg(stock->cached_objcg);
   3211		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
   3212			return true;
   3213	}
   3214
   3215	return false;
   3216}
   3217
   3218static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
   3219			     bool allow_uncharge)
   3220{
   3221	struct memcg_stock_pcp *stock;
   3222	struct obj_cgroup *old = NULL;
   3223	unsigned long flags;
   3224	unsigned int nr_pages = 0;
   3225
   3226	local_lock_irqsave(&memcg_stock.stock_lock, flags);
   3227
   3228	stock = this_cpu_ptr(&memcg_stock);
   3229	if (stock->cached_objcg != objcg) { /* reset if necessary */
   3230		old = drain_obj_stock(stock);
   3231		obj_cgroup_get(objcg);
   3232		stock->cached_objcg = objcg;
   3233		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
   3234				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
   3235		allow_uncharge = true;	/* Allow uncharge when objcg changes */
   3236	}
   3237	stock->nr_bytes += nr_bytes;
   3238
   3239	if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
   3240		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
   3241		stock->nr_bytes &= (PAGE_SIZE - 1);
   3242	}
   3243
   3244	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
   3245	if (old)
   3246		obj_cgroup_put(old);
   3247
   3248	if (nr_pages)
   3249		obj_cgroup_uncharge_pages(objcg, nr_pages);
   3250}
   3251
   3252int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
   3253{
   3254	unsigned int nr_pages, nr_bytes;
   3255	int ret;
   3256
   3257	if (consume_obj_stock(objcg, size))
   3258		return 0;
   3259
   3260	/*
   3261	 * In theory, objcg->nr_charged_bytes can have enough
   3262	 * pre-charged bytes to satisfy the allocation. However,
   3263	 * flushing objcg->nr_charged_bytes requires two atomic
   3264	 * operations, and objcg->nr_charged_bytes can't be big.
   3265	 * The shared objcg->nr_charged_bytes can also become a
   3266	 * performance bottleneck if all tasks of the same memcg are
    3267	 * trying to update it. So it's better to ignore it and try to
    3268	 * grab some new pages. The stock's nr_bytes will be flushed to
    3269	 * objcg->nr_charged_bytes later on when objcg changes.
    3270	 *
    3271	 * The stock's nr_bytes may contain enough pre-charged bytes
    3272	 * to allow one less page to be charged, but we can't rely
   3273	 * on the pre-charged bytes not being changed outside of
   3274	 * consume_obj_stock() or refill_obj_stock(). So ignore those
   3275	 * pre-charged bytes as well when charging pages. To avoid a
   3276	 * page uncharge right after a page charge, we set the
   3277	 * allow_uncharge flag to false when calling refill_obj_stock()
   3278	 * to temporarily allow the pre-charged bytes to exceed the page
   3279	 * size limit. The maximum reachable value of the pre-charged
   3280	 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
   3281	 * race.
   3282	 */
   3283	nr_pages = size >> PAGE_SHIFT;
   3284	nr_bytes = size & (PAGE_SIZE - 1);
   3285
   3286	if (nr_bytes)
   3287		nr_pages += 1;
   3288
   3289	ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
   3290	if (!ret && nr_bytes)
   3291		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
   3292
   3293	return ret;
   3294}
   3295
   3296void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
   3297{
   3298	refill_obj_stock(objcg, size, true);
   3299}
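
/*
 * Usage sketch (editor's addition, simplified): accounted slab objects are
 * charged at byte granularity, roughly
 *
 *	objcg = get_obj_cgroup_from_current();
 *	if (objcg && !obj_cgroup_charge(objcg, flags, obj_size))
 *		(the allocation is accounted to objcg)
 *
 * and released on free with obj_cgroup_uncharge(objcg, obj_size). The
 * per-CPU obj stock above batches sub-page remainders so most charges and
 * uncharges avoid touching the page counters.
 */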
   3300
   3301#endif /* CONFIG_MEMCG_KMEM */
   3302
   3303/*
   3304 * Because page_memcg(head) is not set on tails, set it now.
   3305 */
   3306void split_page_memcg(struct page *head, unsigned int nr)
   3307{
   3308	struct folio *folio = page_folio(head);
   3309	struct mem_cgroup *memcg = folio_memcg(folio);
   3310	int i;
   3311
   3312	if (mem_cgroup_disabled() || !memcg)
   3313		return;
   3314
   3315	for (i = 1; i < nr; i++)
   3316		folio_page(folio, i)->memcg_data = folio->memcg_data;
   3317
   3318	if (folio_memcg_kmem(folio))
   3319		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
   3320	else
   3321		css_get_many(&memcg->css, nr - 1);
   3322}
   3323
   3324#ifdef CONFIG_MEMCG_SWAP
   3325/**
   3326 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
   3327 * @entry: swap entry to be moved
   3328 * @from:  mem_cgroup which the entry is moved from
   3329 * @to:  mem_cgroup which the entry is moved to
   3330 *
   3331 * It succeeds only when the swap_cgroup's record for this entry is the same
   3332 * as the mem_cgroup's id of @from.
   3333 *
   3334 * Returns 0 on success, -EINVAL on failure.
   3335 *
    3336 * The caller must have charged to @to, IOW, called page_counter_charge() for
    3337 * both res and memsw, and called css_get().
   3338 */
   3339static int mem_cgroup_move_swap_account(swp_entry_t entry,
   3340				struct mem_cgroup *from, struct mem_cgroup *to)
   3341{
   3342	unsigned short old_id, new_id;
   3343
   3344	old_id = mem_cgroup_id(from);
   3345	new_id = mem_cgroup_id(to);
   3346
   3347	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
   3348		mod_memcg_state(from, MEMCG_SWAP, -1);
   3349		mod_memcg_state(to, MEMCG_SWAP, 1);
   3350		return 0;
   3351	}
   3352	return -EINVAL;
   3353}
   3354#else
   3355static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
   3356				struct mem_cgroup *from, struct mem_cgroup *to)
   3357{
   3358	return -EINVAL;
   3359}
   3360#endif
   3361
   3362static DEFINE_MUTEX(memcg_max_mutex);
   3363
   3364static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
   3365				 unsigned long max, bool memsw)
   3366{
   3367	bool enlarge = false;
   3368	bool drained = false;
   3369	int ret;
   3370	bool limits_invariant;
   3371	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
   3372
   3373	do {
   3374		if (signal_pending(current)) {
   3375			ret = -EINTR;
   3376			break;
   3377		}
   3378
   3379		mutex_lock(&memcg_max_mutex);
   3380		/*
   3381		 * Make sure that the new limit (memsw or memory limit) doesn't
   3382		 * break our basic invariant rule memory.max <= memsw.max.
   3383		 */
   3384		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
   3385					   max <= memcg->memsw.max;
   3386		if (!limits_invariant) {
   3387			mutex_unlock(&memcg_max_mutex);
   3388			ret = -EINVAL;
   3389			break;
   3390		}
   3391		if (max > counter->max)
   3392			enlarge = true;
   3393		ret = page_counter_set_max(counter, max);
   3394		mutex_unlock(&memcg_max_mutex);
   3395
   3396		if (!ret)
   3397			break;
   3398
   3399		if (!drained) {
   3400			drain_all_stock(memcg);
   3401			drained = true;
   3402			continue;
   3403		}
   3404
   3405		if (!try_to_free_mem_cgroup_pages(memcg, 1,
   3406					GFP_KERNEL, !memsw)) {
   3407			ret = -EBUSY;
   3408			break;
   3409		}
   3410	} while (true);
   3411
   3412	if (!ret && enlarge)
   3413		memcg_oom_recover(memcg);
   3414
   3415	return ret;
   3416}
   3417
   3418unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
   3419					    gfp_t gfp_mask,
   3420					    unsigned long *total_scanned)
   3421{
   3422	unsigned long nr_reclaimed = 0;
   3423	struct mem_cgroup_per_node *mz, *next_mz = NULL;
   3424	unsigned long reclaimed;
   3425	int loop = 0;
   3426	struct mem_cgroup_tree_per_node *mctz;
   3427	unsigned long excess;
   3428
   3429	if (order > 0)
   3430		return 0;
   3431
   3432	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
   3433
   3434	/*
   3435	 * Do not even bother to check the largest node if the root
   3436	 * is empty. Do it lockless to prevent lock bouncing. Races
   3437	 * are acceptable as soft limit is best effort anyway.
   3438	 */
   3439	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
   3440		return 0;
   3441
   3442	/*
   3443	 * This loop can run for a while, especially if mem_cgroups continuously
   3444	 * keep exceeding their soft limit and putting the system under
   3445	 * pressure.
   3446	 */
   3447	do {
   3448		if (next_mz)
   3449			mz = next_mz;
   3450		else
   3451			mz = mem_cgroup_largest_soft_limit_node(mctz);
   3452		if (!mz)
   3453			break;
   3454
   3455		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
   3456						    gfp_mask, total_scanned);
   3457		nr_reclaimed += reclaimed;
   3458		spin_lock_irq(&mctz->lock);
   3459
   3460		/*
   3461		 * If we failed to reclaim anything from this memory cgroup
   3462		 * it is time to move on to the next cgroup
   3463		 */
   3464		next_mz = NULL;
   3465		if (!reclaimed)
   3466			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
   3467
   3468		excess = soft_limit_excess(mz->memcg);
   3469		/*
   3470		 * One school of thought says that we should not add
   3471		 * back the node to the tree if reclaim returns 0.
   3472		 * But our reclaim could return 0 simply because, at the
   3473		 * current priority, we are exposing a smaller subset of
   3474		 * memory to reclaim from. Consider this as a longer
   3475		 * term TODO.
   3476		 */
   3477		/* If excess == 0, no tree ops */
   3478		__mem_cgroup_insert_exceeded(mz, mctz, excess);
   3479		spin_unlock_irq(&mctz->lock);
   3480		css_put(&mz->memcg->css);
   3481		loop++;
   3482		/*
   3483		 * Stop if we could not reclaim anything and there are no
   3484		 * more mem cgroups to try, or if we seem to be looping
   3485		 * without reclaiming anything.
   3486		 */
   3487		if (!nr_reclaimed &&
   3488			(next_mz == NULL ||
   3489			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
   3490			break;
   3491	} while (!nr_reclaimed);
   3492	if (next_mz)
   3493		css_put(&next_mz->memcg->css);
   3494	return nr_reclaimed;
   3495}
   3496
   3497/*
   3498 * Reclaims as many pages from the given memcg as possible.
   3499 *
   3500 * Caller is responsible for holding css reference for memcg.
   3501 */
   3502static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
   3503{
   3504	int nr_retries = MAX_RECLAIM_RETRIES;
   3505
   3506	/* we call try-to-free pages to make this cgroup empty */
   3507	lru_add_drain_all();
   3508
   3509	drain_all_stock(memcg);
   3510
   3511	/* try to free all pages in this cgroup */
   3512	while (nr_retries && page_counter_read(&memcg->memory)) {
   3513		if (signal_pending(current))
   3514			return -EINTR;
   3515
   3516		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
   3517			nr_retries--;
   3518	}
   3519
   3520	return 0;
   3521}
   3522
   3523static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
   3524					    char *buf, size_t nbytes,
   3525					    loff_t off)
   3526{
   3527	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   3528
   3529	if (mem_cgroup_is_root(memcg))
   3530		return -EINVAL;
   3531	return mem_cgroup_force_empty(memcg) ?: nbytes;
   3532}
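
/*
 * Illustrative cgroup v1 usage of the handler above; the mount point and
 * group name are examples only:
 *
 *   # echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty
 *
 * Any write triggers the reclaim loop in mem_cgroup_force_empty(); the
 * written value itself is ignored.
 */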
   3533
   3534static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
   3535				     struct cftype *cft)
   3536{
   3537	return 1;
   3538}
   3539
   3540static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
   3541				      struct cftype *cft, u64 val)
   3542{
   3543	if (val == 1)
   3544		return 0;
   3545
   3546	pr_warn_once("Non-hierarchical mode is deprecated. "
   3547		     "Please report your usecase to linux-mm@kvack.org if you "
   3548		     "depend on this functionality.\n");
   3549
   3550	return -EINVAL;
   3551}
   3552
   3553static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
   3554{
   3555	unsigned long val;
   3556
   3557	if (mem_cgroup_is_root(memcg)) {
   3558		mem_cgroup_flush_stats();
   3559		val = memcg_page_state(memcg, NR_FILE_PAGES) +
   3560			memcg_page_state(memcg, NR_ANON_MAPPED);
   3561		if (swap)
   3562			val += memcg_page_state(memcg, MEMCG_SWAP);
   3563	} else {
   3564		if (!swap)
   3565			val = page_counter_read(&memcg->memory);
   3566		else
   3567			val = page_counter_read(&memcg->memsw);
   3568	}
   3569	return val;
   3570}
   3571
   3572enum {
   3573	RES_USAGE,
   3574	RES_LIMIT,
   3575	RES_MAX_USAGE,
   3576	RES_FAILCNT,
   3577	RES_SOFT_LIMIT,
   3578};
   3579
   3580static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
   3581			       struct cftype *cft)
   3582{
   3583	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   3584	struct page_counter *counter;
   3585
   3586	switch (MEMFILE_TYPE(cft->private)) {
   3587	case _MEM:
   3588		counter = &memcg->memory;
   3589		break;
   3590	case _MEMSWAP:
   3591		counter = &memcg->memsw;
   3592		break;
   3593	case _KMEM:
   3594		counter = &memcg->kmem;
   3595		break;
   3596	case _TCP:
   3597		counter = &memcg->tcpmem;
   3598		break;
   3599	default:
   3600		BUG();
   3601	}
   3602
   3603	switch (MEMFILE_ATTR(cft->private)) {
   3604	case RES_USAGE:
   3605		if (counter == &memcg->memory)
   3606			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
   3607		if (counter == &memcg->memsw)
   3608			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
   3609		return (u64)page_counter_read(counter) * PAGE_SIZE;
   3610	case RES_LIMIT:
   3611		return (u64)counter->max * PAGE_SIZE;
   3612	case RES_MAX_USAGE:
   3613		return (u64)counter->watermark * PAGE_SIZE;
   3614	case RES_FAILCNT:
   3615		return counter->failcnt;
   3616	case RES_SOFT_LIMIT:
   3617		return (u64)memcg->soft_limit * PAGE_SIZE;
   3618	default:
   3619		BUG();
   3620	}
   3621}
   3622
   3623#ifdef CONFIG_MEMCG_KMEM
   3624static int memcg_online_kmem(struct mem_cgroup *memcg)
   3625{
   3626	struct obj_cgroup *objcg;
   3627
   3628	if (cgroup_memory_nokmem)
   3629		return 0;
   3630
   3631	if (unlikely(mem_cgroup_is_root(memcg)))
   3632		return 0;
   3633
   3634	objcg = obj_cgroup_alloc();
   3635	if (!objcg)
   3636		return -ENOMEM;
   3637
   3638	objcg->memcg = memcg;
   3639	rcu_assign_pointer(memcg->objcg, objcg);
   3640
   3641	static_branch_enable(&memcg_kmem_enabled_key);
   3642
   3643	memcg->kmemcg_id = memcg->id.id;
   3644
   3645	return 0;
   3646}
   3647
   3648static void memcg_offline_kmem(struct mem_cgroup *memcg)
   3649{
   3650	struct mem_cgroup *parent;
   3651
   3652	if (cgroup_memory_nokmem)
   3653		return;
   3654
   3655	if (unlikely(mem_cgroup_is_root(memcg)))
   3656		return;
   3657
   3658	parent = parent_mem_cgroup(memcg);
   3659	if (!parent)
   3660		parent = root_mem_cgroup;
   3661
   3662	memcg_reparent_objcgs(memcg, parent);
   3663
   3664	/*
   3665	 * After we have finished memcg_reparent_objcgs(), all list_lrus
   3666	 * corresponding to this cgroup are guaranteed to remain empty.
   3667	 * The ordering is imposed by list_lru_node->lock taken by
   3668	 * memcg_reparent_list_lrus().
   3669	 */
   3670	memcg_reparent_list_lrus(memcg, parent);
   3671}
   3672#else
   3673static int memcg_online_kmem(struct mem_cgroup *memcg)
   3674{
   3675	return 0;
   3676}
   3677static void memcg_offline_kmem(struct mem_cgroup *memcg)
   3678{
   3679}
   3680#endif /* CONFIG_MEMCG_KMEM */
   3681
   3682static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
   3683{
   3684	int ret;
   3685
   3686	mutex_lock(&memcg_max_mutex);
   3687
   3688	ret = page_counter_set_max(&memcg->tcpmem, max);
   3689	if (ret)
   3690		goto out;
   3691
   3692	if (!memcg->tcpmem_active) {
   3693		/*
   3694		 * The active flag needs to be written after the static_key
   3695		 * update. This is what guarantees that the socket activation
   3696		 * function is the last one to run. See mem_cgroup_sk_alloc()
   3697		 * for details, and note that we don't mark any socket as
   3698		 * belonging to this memcg until that flag is up.
   3699		 *
   3700		 * We need to do this, because static_keys will span multiple
   3701		 * sites, but we can't control their order. If we mark a socket
   3702		 * as accounted, but the accounting functions are not patched in
   3703		 * yet, we'll lose accounting.
   3704		 *
   3705		 * We never race with the readers in mem_cgroup_sk_alloc(),
   3706		 * because when this value changes, the code to process it is not
   3707		 * patched in yet.
   3708		 */
   3709		static_branch_inc(&memcg_sockets_enabled_key);
   3710		memcg->tcpmem_active = true;
   3711	}
   3712out:
   3713	mutex_unlock(&memcg_max_mutex);
   3714	return ret;
   3715}
   3716
   3717/*
   3718 * Write handler for the RES_LIMIT and RES_SOFT_LIMIT control
   3719 * files (limit_in_bytes, soft_limit_in_bytes and their variants).
   3720 */
   3721static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
   3722				char *buf, size_t nbytes, loff_t off)
   3723{
   3724	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   3725	unsigned long nr_pages;
   3726	int ret;
   3727
   3728	buf = strstrip(buf);
   3729	ret = page_counter_memparse(buf, "-1", &nr_pages);
   3730	if (ret)
   3731		return ret;
   3732
   3733	switch (MEMFILE_ATTR(of_cft(of)->private)) {
   3734	case RES_LIMIT:
   3735		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
   3736			ret = -EINVAL;
   3737			break;
   3738		}
   3739		switch (MEMFILE_TYPE(of_cft(of)->private)) {
   3740		case _MEM:
   3741			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
   3742			break;
   3743		case _MEMSWAP:
   3744			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
   3745			break;
   3746		case _KMEM:
   3747			/* kmem.limit_in_bytes is deprecated. */
   3748			ret = -EOPNOTSUPP;
   3749			break;
   3750		case _TCP:
   3751			ret = memcg_update_tcp_max(memcg, nr_pages);
   3752			break;
   3753		}
   3754		break;
   3755	case RES_SOFT_LIMIT:
   3756		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
   3757			ret = -EOPNOTSUPP;
   3758		} else {
   3759			memcg->soft_limit = nr_pages;
   3760			ret = 0;
   3761		}
   3762		break;
   3763	}
   3764	return ret ?: nbytes;
   3765}
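
/*
 * Illustrative cgroup v1 usage of the handler above; the mount point and
 * group name are examples only:
 *
 *   # echo 512M > /sys/fs/cgroup/memory/<group>/memory.limit_in_bytes
 *   # echo -1 > /sys/fs/cgroup/memory/<group>/memory.limit_in_bytes
 *
 * page_counter_memparse() accepts the usual memparse() suffixes (K, M, G,
 * ...) and maps "-1" to PAGE_COUNTER_MAX (no limit); the byte value is
 * rounded down to a whole number of pages.
 */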
   3766
   3767static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
   3768				size_t nbytes, loff_t off)
   3769{
   3770	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   3771	struct page_counter *counter;
   3772
   3773	switch (MEMFILE_TYPE(of_cft(of)->private)) {
   3774	case _MEM:
   3775		counter = &memcg->memory;
   3776		break;
   3777	case _MEMSWAP:
   3778		counter = &memcg->memsw;
   3779		break;
   3780	case _KMEM:
   3781		counter = &memcg->kmem;
   3782		break;
   3783	case _TCP:
   3784		counter = &memcg->tcpmem;
   3785		break;
   3786	default:
   3787		BUG();
   3788	}
   3789
   3790	switch (MEMFILE_ATTR(of_cft(of)->private)) {
   3791	case RES_MAX_USAGE:
   3792		page_counter_reset_watermark(counter);
   3793		break;
   3794	case RES_FAILCNT:
   3795		counter->failcnt = 0;
   3796		break;
   3797	default:
   3798		BUG();
   3799	}
   3800
   3801	return nbytes;
   3802}
   3803
   3804static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
   3805					struct cftype *cft)
   3806{
   3807	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
   3808}
   3809
   3810#ifdef CONFIG_MMU
   3811static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
   3812					struct cftype *cft, u64 val)
   3813{
   3814	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   3815
   3816	if (val & ~MOVE_MASK)
   3817		return -EINVAL;
   3818
   3819	/*
   3820	 * No locking is needed here, because ->can_attach() will check
   3821	 * this value once at the beginning of the process, and then carry
   3822	 * on with stale data. This means that changes to this value will only
   3823	 * affect task migrations starting after the change.
   3824	 */
   3825	memcg->move_charge_at_immigrate = val;
   3826	return 0;
   3827}
   3828#else
   3829static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
   3830					struct cftype *cft, u64 val)
   3831{
   3832	return -ENOSYS;
   3833}
   3834#endif
   3835
   3836#ifdef CONFIG_NUMA
   3837
   3838#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
   3839#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
   3840#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)
   3841
   3842static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
   3843				int nid, unsigned int lru_mask, bool tree)
   3844{
   3845	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
   3846	unsigned long nr = 0;
   3847	enum lru_list lru;
   3848
   3849	VM_BUG_ON((unsigned)nid >= nr_node_ids);
   3850
   3851	for_each_lru(lru) {
   3852		if (!(BIT(lru) & lru_mask))
   3853			continue;
   3854		if (tree)
   3855			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
   3856		else
   3857			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
   3858	}
   3859	return nr;
   3860}
   3861
   3862static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
   3863					     unsigned int lru_mask,
   3864					     bool tree)
   3865{
   3866	unsigned long nr = 0;
   3867	enum lru_list lru;
   3868
   3869	for_each_lru(lru) {
   3870		if (!(BIT(lru) & lru_mask))
   3871			continue;
   3872		if (tree)
   3873			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
   3874		else
   3875			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
   3876	}
   3877	return nr;
   3878}
   3879
   3880static int memcg_numa_stat_show(struct seq_file *m, void *v)
   3881{
   3882	struct numa_stat {
   3883		const char *name;
   3884		unsigned int lru_mask;
   3885	};
   3886
   3887	static const struct numa_stat stats[] = {
   3888		{ "total", LRU_ALL },
   3889		{ "file", LRU_ALL_FILE },
   3890		{ "anon", LRU_ALL_ANON },
   3891		{ "unevictable", BIT(LRU_UNEVICTABLE) },
   3892	};
   3893	const struct numa_stat *stat;
   3894	int nid;
   3895	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   3896
   3897	mem_cgroup_flush_stats();
   3898
   3899	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
   3900		seq_printf(m, "%s=%lu", stat->name,
   3901			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
   3902						   false));
   3903		for_each_node_state(nid, N_MEMORY)
   3904			seq_printf(m, " N%d=%lu", nid,
   3905				   mem_cgroup_node_nr_lru_pages(memcg, nid,
   3906							stat->lru_mask, false));
   3907		seq_putc(m, '\n');
   3908	}
   3909
   3910	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
   3911
   3912		seq_printf(m, "hierarchical_%s=%lu", stat->name,
   3913			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
   3914						   true));
   3915		for_each_node_state(nid, N_MEMORY)
   3916			seq_printf(m, " N%d=%lu", nid,
   3917				   mem_cgroup_node_nr_lru_pages(memcg, nid,
   3918							stat->lru_mask, true));
   3919		seq_putc(m, '\n');
   3920	}
   3921
   3922	return 0;
   3923}
   3924#endif /* CONFIG_NUMA */
   3925
   3926static const unsigned int memcg1_stats[] = {
   3927	NR_FILE_PAGES,
   3928	NR_ANON_MAPPED,
   3929#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   3930	NR_ANON_THPS,
   3931#endif
   3932	NR_SHMEM,
   3933	NR_FILE_MAPPED,
   3934	NR_FILE_DIRTY,
   3935	NR_WRITEBACK,
   3936	MEMCG_SWAP,
   3937};
   3938
   3939static const char *const memcg1_stat_names[] = {
   3940	"cache",
   3941	"rss",
   3942#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   3943	"rss_huge",
   3944#endif
   3945	"shmem",
   3946	"mapped_file",
   3947	"dirty",
   3948	"writeback",
   3949	"swap",
   3950};
   3951
   3952/* Universal VM events cgroup1 shows, original sort order */
   3953static const unsigned int memcg1_events[] = {
   3954	PGPGIN,
   3955	PGPGOUT,
   3956	PGFAULT,
   3957	PGMAJFAULT,
   3958};
   3959
   3960static int memcg_stat_show(struct seq_file *m, void *v)
   3961{
   3962	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   3963	unsigned long memory, memsw;
   3964	struct mem_cgroup *mi;
   3965	unsigned int i;
   3966
   3967	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
   3968
   3969	mem_cgroup_flush_stats();
   3970
   3971	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
   3972		unsigned long nr;
   3973
   3974		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
   3975			continue;
   3976		nr = memcg_page_state_local(memcg, memcg1_stats[i]);
   3977		seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
   3978	}
   3979
   3980	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
   3981		seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
   3982			   memcg_events_local(memcg, memcg1_events[i]));
   3983
   3984	for (i = 0; i < NR_LRU_LISTS; i++)
   3985		seq_printf(m, "%s %lu\n", lru_list_name(i),
   3986			   memcg_page_state_local(memcg, NR_LRU_BASE + i) *
   3987			   PAGE_SIZE);
   3988
   3989	/* Hierarchical information */
   3990	memory = memsw = PAGE_COUNTER_MAX;
   3991	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
   3992		memory = min(memory, READ_ONCE(mi->memory.max));
   3993		memsw = min(memsw, READ_ONCE(mi->memsw.max));
   3994	}
   3995	seq_printf(m, "hierarchical_memory_limit %llu\n",
   3996		   (u64)memory * PAGE_SIZE);
   3997	if (do_memsw_account())
   3998		seq_printf(m, "hierarchical_memsw_limit %llu\n",
   3999			   (u64)memsw * PAGE_SIZE);
   4000
   4001	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
   4002		unsigned long nr;
   4003
   4004		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
   4005			continue;
   4006		nr = memcg_page_state(memcg, memcg1_stats[i]);
   4007		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
   4008						(u64)nr * PAGE_SIZE);
   4009	}
   4010
   4011	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
   4012		seq_printf(m, "total_%s %llu\n",
   4013			   vm_event_name(memcg1_events[i]),
   4014			   (u64)memcg_events(memcg, memcg1_events[i]));
   4015
   4016	for (i = 0; i < NR_LRU_LISTS; i++)
   4017		seq_printf(m, "total_%s %llu\n", lru_list_name(i),
   4018			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
   4019			   PAGE_SIZE);
   4020
   4021#ifdef CONFIG_DEBUG_VM
   4022	{
   4023		pg_data_t *pgdat;
   4024		struct mem_cgroup_per_node *mz;
   4025		unsigned long anon_cost = 0;
   4026		unsigned long file_cost = 0;
   4027
   4028		for_each_online_pgdat(pgdat) {
   4029			mz = memcg->nodeinfo[pgdat->node_id];
   4030
   4031			anon_cost += mz->lruvec.anon_cost;
   4032			file_cost += mz->lruvec.file_cost;
   4033		}
   4034		seq_printf(m, "anon_cost %lu\n", anon_cost);
   4035		seq_printf(m, "file_cost %lu\n", file_cost);
   4036	}
   4037#endif
   4038
   4039	return 0;
   4040}
   4041
   4042static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
   4043				      struct cftype *cft)
   4044{
   4045	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   4046
   4047	return mem_cgroup_swappiness(memcg);
   4048}
   4049
   4050static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
   4051				       struct cftype *cft, u64 val)
   4052{
   4053	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   4054
   4055	if (val > 200)
   4056		return -EINVAL;
   4057
   4058	if (!mem_cgroup_is_root(memcg))
   4059		memcg->swappiness = val;
   4060	else
   4061		vm_swappiness = val;
   4062
   4063	return 0;
   4064}
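
/*
 * Illustrative cgroup v1 usage of the handler above; the path is an
 * example only:
 *
 *   # echo 100 > /sys/fs/cgroup/memory/<group>/memory.swappiness
 *
 * Values from 0 to 200 are accepted (not 0-100); writing to the root
 * cgroup's file updates the global vm_swappiness instead.
 */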
   4065
   4066static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
   4067{
   4068	struct mem_cgroup_threshold_ary *t;
   4069	unsigned long usage;
   4070	int i;
   4071
   4072	rcu_read_lock();
   4073	if (!swap)
   4074		t = rcu_dereference(memcg->thresholds.primary);
   4075	else
   4076		t = rcu_dereference(memcg->memsw_thresholds.primary);
   4077
   4078	if (!t)
   4079		goto unlock;
   4080
   4081	usage = mem_cgroup_usage(memcg, swap);
   4082
   4083	/*
   4084	 * current_threshold points to the threshold just below or equal to
   4085	 * usage. If that is no longer true, a threshold was crossed after the
   4086	 * last call of __mem_cgroup_threshold().
   4087	 */
   4088	i = t->current_threshold;
   4089
   4090	/*
   4091	 * Iterate backward over array of thresholds starting from
   4092	 * current_threshold and check if a threshold is crossed.
   4093	 * If none of the thresholds below usage is crossed, we read
   4094	 * only one element of the array here.
   4095	 */
   4096	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
   4097		eventfd_signal(t->entries[i].eventfd, 1);
   4098
   4099	/* i = current_threshold + 1 */
   4100	i++;
   4101
   4102	/*
   4103	 * Iterate forward over array of thresholds starting from
   4104	 * current_threshold+1 and check if a threshold is crossed.
   4105	 * If none of the thresholds above usage is crossed, we read
   4106	 * only one element of the array here.
   4107	 */
   4108	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
   4109		eventfd_signal(t->entries[i].eventfd, 1);
   4110
   4111	/* Update current_threshold */
   4112	t->current_threshold = i - 1;
   4113unlock:
   4114	rcu_read_unlock();
   4115}
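
/*
 * A worked example of the two scans above (derived from the code): suppose
 * the sorted thresholds are {4M, 8M, 16M} and current_threshold is 1 (the
 * 8M entry), i.e. the last observed usage was in [8M, 16M).
 *
 *   - usage drops to 6M:  the backward scan signals the 8M eventfd and
 *     stops at the 4M entry; current_threshold becomes 0.
 *   - usage rises to 20M: the forward scan signals the 16M eventfd and
 *     stops at the end of the array; current_threshold becomes 2.
 *
 * When no threshold has been crossed, each scan examines a single array
 * element and signals nothing.
 */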
   4116
   4117static void mem_cgroup_threshold(struct mem_cgroup *memcg)
   4118{
   4119	while (memcg) {
   4120		__mem_cgroup_threshold(memcg, false);
   4121		if (do_memsw_account())
   4122			__mem_cgroup_threshold(memcg, true);
   4123
   4124		memcg = parent_mem_cgroup(memcg);
   4125	}
   4126}
   4127
   4128static int compare_thresholds(const void *a, const void *b)
   4129{
   4130	const struct mem_cgroup_threshold *_a = a;
   4131	const struct mem_cgroup_threshold *_b = b;
   4132
   4133	if (_a->threshold > _b->threshold)
   4134		return 1;
   4135
   4136	if (_a->threshold < _b->threshold)
   4137		return -1;
   4138
   4139	return 0;
   4140}
   4141
   4142static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
   4143{
   4144	struct mem_cgroup_eventfd_list *ev;
   4145
   4146	spin_lock(&memcg_oom_lock);
   4147
   4148	list_for_each_entry(ev, &memcg->oom_notify, list)
   4149		eventfd_signal(ev->eventfd, 1);
   4150
   4151	spin_unlock(&memcg_oom_lock);
   4152	return 0;
   4153}
   4154
   4155static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
   4156{
   4157	struct mem_cgroup *iter;
   4158
   4159	for_each_mem_cgroup_tree(iter, memcg)
   4160		mem_cgroup_oom_notify_cb(iter);
   4161}
   4162
   4163static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
   4164	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
   4165{
   4166	struct mem_cgroup_thresholds *thresholds;
   4167	struct mem_cgroup_threshold_ary *new;
   4168	unsigned long threshold;
   4169	unsigned long usage;
   4170	int i, size, ret;
   4171
   4172	ret = page_counter_memparse(args, "-1", &threshold);
   4173	if (ret)
   4174		return ret;
   4175
   4176	mutex_lock(&memcg->thresholds_lock);
   4177
   4178	if (type == _MEM) {
   4179		thresholds = &memcg->thresholds;
   4180		usage = mem_cgroup_usage(memcg, false);
   4181	} else if (type == _MEMSWAP) {
   4182		thresholds = &memcg->memsw_thresholds;
   4183		usage = mem_cgroup_usage(memcg, true);
   4184	} else
   4185		BUG();
   4186
   4187	/* Check if a threshold was crossed before adding a new one */
   4188	if (thresholds->primary)
   4189		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
   4190
   4191	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
   4192
   4193	/* Allocate memory for new array of thresholds */
   4194	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
   4195	if (!new) {
   4196		ret = -ENOMEM;
   4197		goto unlock;
   4198	}
   4199	new->size = size;
   4200
   4201	/* Copy thresholds (if any) to new array */
   4202	if (thresholds->primary)
   4203		memcpy(new->entries, thresholds->primary->entries,
   4204		       flex_array_size(new, entries, size - 1));
   4205
   4206	/* Add new threshold */
   4207	new->entries[size - 1].eventfd = eventfd;
   4208	new->entries[size - 1].threshold = threshold;
   4209
   4210	/* Sort thresholds. Registering a new threshold isn't time-critical */
   4211	sort(new->entries, size, sizeof(*new->entries),
   4212			compare_thresholds, NULL);
   4213
   4214	/* Find current threshold */
   4215	new->current_threshold = -1;
   4216	for (i = 0; i < size; i++) {
   4217		if (new->entries[i].threshold <= usage) {
   4218			/*
   4219			 * new->current_threshold will not be used until
   4220			 * rcu_assign_pointer(), so it's safe to increment
   4221			 * it here.
   4222			 */
   4223			++new->current_threshold;
   4224		} else
   4225			break;
   4226	}
   4227
   4228	/* Free old spare buffer and save old primary buffer as spare */
   4229	kfree(thresholds->spare);
   4230	thresholds->spare = thresholds->primary;
   4231
   4232	rcu_assign_pointer(thresholds->primary, new);
   4233
   4234	/* To be sure that nobody uses thresholds */
   4235	synchronize_rcu();
   4236
   4237unlock:
   4238	mutex_unlock(&memcg->thresholds_lock);
   4239
   4240	return ret;
   4241}
   4242
   4243static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
   4244	struct eventfd_ctx *eventfd, const char *args)
   4245{
   4246	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
   4247}
   4248
   4249static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
   4250	struct eventfd_ctx *eventfd, const char *args)
   4251{
   4252	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
   4253}
   4254
   4255static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
   4256	struct eventfd_ctx *eventfd, enum res_type type)
   4257{
   4258	struct mem_cgroup_thresholds *thresholds;
   4259	struct mem_cgroup_threshold_ary *new;
   4260	unsigned long usage;
   4261	int i, j, size, entries;
   4262
   4263	mutex_lock(&memcg->thresholds_lock);
   4264
   4265	if (type == _MEM) {
   4266		thresholds = &memcg->thresholds;
   4267		usage = mem_cgroup_usage(memcg, false);
   4268	} else if (type == _MEMSWAP) {
   4269		thresholds = &memcg->memsw_thresholds;
   4270		usage = mem_cgroup_usage(memcg, true);
   4271	} else
   4272		BUG();
   4273
   4274	if (!thresholds->primary)
   4275		goto unlock;
   4276
   4277	/* Check if a threshold was crossed before removing */
   4278	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
   4279
   4280	/* Calculate the new number of thresholds */
   4281	size = entries = 0;
   4282	for (i = 0; i < thresholds->primary->size; i++) {
   4283		if (thresholds->primary->entries[i].eventfd != eventfd)
   4284			size++;
   4285		else
   4286			entries++;
   4287	}
   4288
   4289	new = thresholds->spare;
   4290
   4291	/* If no items related to eventfd have been cleared, nothing to do */
   4292	if (!entries)
   4293		goto unlock;
   4294
   4295	/* Set thresholds array to NULL if we don't have thresholds */
   4296	if (!size) {
   4297		kfree(new);
   4298		new = NULL;
   4299		goto swap_buffers;
   4300	}
   4301
   4302	new->size = size;
   4303
   4304	/* Copy thresholds and find current threshold */
   4305	new->current_threshold = -1;
   4306	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
   4307		if (thresholds->primary->entries[i].eventfd == eventfd)
   4308			continue;
   4309
   4310		new->entries[j] = thresholds->primary->entries[i];
   4311		if (new->entries[j].threshold <= usage) {
   4312			/*
   4313			 * new->current_threshold will not be used
   4314			 * until rcu_assign_pointer(), so it's safe to increment
   4315			 * it here.
   4316			 */
   4317			++new->current_threshold;
   4318		}
   4319		j++;
   4320	}
   4321
   4322swap_buffers:
   4323	/* Swap primary and spare array */
   4324	thresholds->spare = thresholds->primary;
   4325
   4326	rcu_assign_pointer(thresholds->primary, new);
   4327
   4328	/* To be sure that nobody uses thresholds */
   4329	synchronize_rcu();
   4330
   4331	/* If all events are unregistered, free the spare array */
   4332	if (!new) {
   4333		kfree(thresholds->spare);
   4334		thresholds->spare = NULL;
   4335	}
   4336unlock:
   4337	mutex_unlock(&memcg->thresholds_lock);
   4338}
   4339
   4340static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
   4341	struct eventfd_ctx *eventfd)
   4342{
   4343	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
   4344}
   4345
   4346static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
   4347	struct eventfd_ctx *eventfd)
   4348{
   4349	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
   4350}
   4351
   4352static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
   4353	struct eventfd_ctx *eventfd, const char *args)
   4354{
   4355	struct mem_cgroup_eventfd_list *event;
   4356
   4357	event = kmalloc(sizeof(*event),	GFP_KERNEL);
   4358	if (!event)
   4359		return -ENOMEM;
   4360
   4361	spin_lock(&memcg_oom_lock);
   4362
   4363	event->eventfd = eventfd;
   4364	list_add(&event->list, &memcg->oom_notify);
   4365
   4366	/* already in OOM? */
   4367	if (memcg->under_oom)
   4368		eventfd_signal(eventfd, 1);
   4369	spin_unlock(&memcg_oom_lock);
   4370
   4371	return 0;
   4372}
   4373
   4374static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
   4375	struct eventfd_ctx *eventfd)
   4376{
   4377	struct mem_cgroup_eventfd_list *ev, *tmp;
   4378
   4379	spin_lock(&memcg_oom_lock);
   4380
   4381	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
   4382		if (ev->eventfd == eventfd) {
   4383			list_del(&ev->list);
   4384			kfree(ev);
   4385		}
   4386	}
   4387
   4388	spin_unlock(&memcg_oom_lock);
   4389}
   4390
   4391static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
   4392{
   4393	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
   4394
   4395	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
   4396	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
   4397	seq_printf(sf, "oom_kill %lu\n",
   4398		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
   4399	return 0;
   4400}
   4401
   4402static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
   4403	struct cftype *cft, u64 val)
   4404{
   4405	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   4406
   4407	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */
   4408	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
   4409		return -EINVAL;
   4410
   4411	memcg->oom_kill_disable = val;
   4412	if (!val)
   4413		memcg_oom_recover(memcg);
   4414
   4415	return 0;
   4416}
   4417
   4418#ifdef CONFIG_CGROUP_WRITEBACK
   4419
   4420#include <trace/events/writeback.h>
   4421
   4422static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
   4423{
   4424	return wb_domain_init(&memcg->cgwb_domain, gfp);
   4425}
   4426
   4427static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
   4428{
   4429	wb_domain_exit(&memcg->cgwb_domain);
   4430}
   4431
   4432static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
   4433{
   4434	wb_domain_size_changed(&memcg->cgwb_domain);
   4435}
   4436
   4437struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
   4438{
   4439	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
   4440
   4441	if (!memcg->css.parent)
   4442		return NULL;
   4443
   4444	return &memcg->cgwb_domain;
   4445}
   4446
   4447/**
   4448 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   4449 * @wb: bdi_writeback in question
   4450 * @pfilepages: out parameter for number of file pages
   4451 * @pheadroom: out parameter for number of allocatable pages according to memcg
   4452 * @pdirty: out parameter for number of dirty pages
   4453 * @pwriteback: out parameter for number of pages under writeback
   4454 *
   4455 * Determine the numbers of file, headroom, dirty, and writeback pages in
   4456 * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
   4457 * is a bit more involved.
   4458 *
   4459 * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
   4460 * headroom is calculated as the lowest headroom of itself and the
   4461 * ancestors.  Note that this doesn't consider the actual amount of
   4462 * available memory in the system.  The caller should further cap
   4463 * *@pheadroom accordingly.
   4464 */
   4465void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
   4466			 unsigned long *pheadroom, unsigned long *pdirty,
   4467			 unsigned long *pwriteback)
   4468{
   4469	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
   4470	struct mem_cgroup *parent;
   4471
   4472	mem_cgroup_flush_stats();
   4473
   4474	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
   4475	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
   4476	*pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
   4477			memcg_page_state(memcg, NR_ACTIVE_FILE);
   4478
   4479	*pheadroom = PAGE_COUNTER_MAX;
   4480	while ((parent = parent_mem_cgroup(memcg))) {
   4481		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
   4482					    READ_ONCE(memcg->memory.high));
   4483		unsigned long used = page_counter_read(&memcg->memory);
   4484
   4485		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
   4486		memcg = parent;
   4487	}
   4488}
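
/*
 * A worked example of the headroom walk above (derived from the code):
 * consider a hierarchy root/A/B where writeback happens in B,
 *
 *   B: max = 200 pages, high = PAGE_COUNTER_MAX, used = 120
 *        -> ceiling = 200, headroom = 80
 *   A: max = 300 pages, high = 250,              used = 240
 *        -> ceiling = 250, headroom = 10
 *
 * so *pheadroom = min(80, 10) = 10 pages. If used ever exceeds the
 * ceiling, the "ceiling - min(ceiling, used)" form clamps that level's
 * headroom to zero instead of underflowing.
 */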
   4489
   4490/*
   4491 * Foreign dirty flushing
   4492 *
   4493 * There's an inherent mismatch between memcg and writeback.  The former
   4494 * tracks ownership per-page while the latter per-inode.  This was a
   4495 * deliberate design decision because honoring per-page ownership in the
   4496 * writeback path is complicated, may lead to higher CPU and IO overheads
   4497 * and deemed unnecessary given that write-sharing an inode across
   4498 * different cgroups isn't a common use-case.
   4499 *
   4500 * Combined with inode majority-writer ownership switching, this works well
   4501 * enough in most cases but there are some pathological cases.  For
   4502 * example, let's say there are two cgroups A and B which keep writing to
   4503 * different but confined parts of the same inode.  B owns the inode and
   4504 * A's memory is limited far below B's.  A's dirty ratio can rise enough to
   4505 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
   4506 * triggering background writeback.  A will be slowed down without a way to
   4507 * make writeback of the dirty pages happen.
   4508 *
   4509 * Conditions like the above can lead to a cgroup getting repeatedly and
   4510 * severely throttled after making some progress after each
   4511 * dirty_expire_interval while the underlying IO device is almost
   4512 * completely idle.
   4513 *
   4514 * Solving this problem completely requires matching the ownership tracking
   4515 * granularities between memcg and writeback in either direction.  However,
   4516 * the more egregious behaviors can be avoided by simply remembering the
   4517 * most recent foreign dirtying events and initiating remote flushes on
   4518 * them when local writeback isn't enough to keep the memory clean enough.
   4519 *
   4520 * The following two functions implement such mechanism.  When a foreign
   4521 * page - a page whose memcg and writeback ownerships don't match - is
   4522 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
   4523 * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
   4524 * decides that the memcg needs to sleep due to high dirty ratio, it calls
   4525 * mem_cgroup_flush_foreign() which queues writeback on the recorded
   4526 * foreign bdi_writebacks which haven't expired.  Both the numbers of
   4527 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
   4528 * limited to MEMCG_CGWB_FRN_CNT.
   4529 *
   4530 * The mechanism only remembers IDs and doesn't hold any object references.
   4531 * As being wrong occasionally doesn't matter, updates and accesses to the
   4532 * records are lockless and racy.
   4533 */
   4534void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
   4535					     struct bdi_writeback *wb)
   4536{
   4537	struct mem_cgroup *memcg = folio_memcg(folio);
   4538	struct memcg_cgwb_frn *frn;
   4539	u64 now = get_jiffies_64();
   4540	u64 oldest_at = now;
   4541	int oldest = -1;
   4542	int i;
   4543
   4544	trace_track_foreign_dirty(folio, wb);
   4545
   4546	/*
   4547	 * Pick the slot to use.  If there is already a slot for @wb, keep
   4548	 * using it.  If not, replace the oldest one which isn't being
   4549	 * written out.
   4550	 */
   4551	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
   4552		frn = &memcg->cgwb_frn[i];
   4553		if (frn->bdi_id == wb->bdi->id &&
   4554		    frn->memcg_id == wb->memcg_css->id)
   4555			break;
   4556		if (time_before64(frn->at, oldest_at) &&
   4557		    atomic_read(&frn->done.cnt) == 1) {
   4558			oldest = i;
   4559			oldest_at = frn->at;
   4560		}
   4561	}
   4562
   4563	if (i < MEMCG_CGWB_FRN_CNT) {
   4564		/*
   4565		 * Re-using an existing one.  Update timestamp lazily to
   4566		 * avoid making the cacheline hot.  We want them to be
   4567		 * reasonably up-to-date and significantly shorter than
   4568		 * dirty_expire_interval as that's what expires the record.
   4569		 * Use the shorter of 1s and dirty_expire_interval / 8.
   4570		 */
   4571		unsigned long update_intv =
   4572			min_t(unsigned long, HZ,
   4573			      msecs_to_jiffies(dirty_expire_interval * 10) / 8);
   4574
   4575		if (time_before64(frn->at, now - update_intv))
   4576			frn->at = now;
   4577	} else if (oldest >= 0) {
   4578		/* replace the oldest free one */
   4579		frn = &memcg->cgwb_frn[oldest];
   4580		frn->bdi_id = wb->bdi->id;
   4581		frn->memcg_id = wb->memcg_css->id;
   4582		frn->at = now;
   4583	}
   4584}
   4585
   4586/* issue foreign writeback flushes for recorded foreign dirtying events */
   4587void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
   4588{
   4589	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
   4590	unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
   4591	u64 now = jiffies_64;
   4592	int i;
   4593
   4594	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
   4595		struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
   4596
   4597		/*
   4598		 * If the record is older than dirty_expire_interval,
   4599		 * writeback on it has already started.  No need to kick it
   4600		 * off again.  Also, don't start a new one if there's
   4601		 * already one in flight.
   4602		 */
   4603		if (time_after64(frn->at, now - intv) &&
   4604		    atomic_read(&frn->done.cnt) == 1) {
   4605			frn->at = 0;
   4606			trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
   4607			cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
   4608					       WB_REASON_FOREIGN_FLUSH,
   4609					       &frn->done);
   4610		}
   4611	}
   4612}
   4613
   4614#else	/* CONFIG_CGROUP_WRITEBACK */
   4615
   4616static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
   4617{
   4618	return 0;
   4619}
   4620
   4621static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
   4622{
   4623}
   4624
   4625static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
   4626{
   4627}
   4628
   4629#endif	/* CONFIG_CGROUP_WRITEBACK */
   4630
   4631/*
   4632 * DO NOT USE IN NEW FILES.
   4633 *
   4634 * "cgroup.event_control" implementation.
   4635 *
   4636 * This is way over-engineered.  It tries to support fully configurable
   4637 * events for each user.  Such a level of flexibility is completely
   4638 * unnecessary, especially in light of the planned unified hierarchy.
   4639 *
   4640 * Please deprecate this and replace with something simpler if at all
   4641 * possible.
   4642 */
   4643
   4644/*
   4645 * Unregister event and free resources.
   4646 *
   4647 * Gets called from workqueue.
   4648 */
   4649static void memcg_event_remove(struct work_struct *work)
   4650{
   4651	struct mem_cgroup_event *event =
   4652		container_of(work, struct mem_cgroup_event, remove);
   4653	struct mem_cgroup *memcg = event->memcg;
   4654
   4655	remove_wait_queue(event->wqh, &event->wait);
   4656
   4657	event->unregister_event(memcg, event->eventfd);
   4658
   4659	/* Notify userspace the event is going away. */
   4660	eventfd_signal(event->eventfd, 1);
   4661
   4662	eventfd_ctx_put(event->eventfd);
   4663	kfree(event);
   4664	css_put(&memcg->css);
   4665}
   4666
   4667/*
   4668 * Gets called on EPOLLHUP on eventfd when user closes it.
   4669 *
   4670 * Called with wqh->lock held and interrupts disabled.
   4671 */
   4672static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
   4673			    int sync, void *key)
   4674{
   4675	struct mem_cgroup_event *event =
   4676		container_of(wait, struct mem_cgroup_event, wait);
   4677	struct mem_cgroup *memcg = event->memcg;
   4678	__poll_t flags = key_to_poll(key);
   4679
   4680	if (flags & EPOLLHUP) {
   4681		/*
   4682		 * If the event has been detached at cgroup removal, we
   4683		 * can simply return knowing the other side will clean up
   4684		 * for us.
   4685		 *
   4686		 * We can't race against event freeing since the other
   4687		 * side will require wqh->lock via remove_wait_queue(),
   4688		 * which we hold.
   4689		 */
   4690		spin_lock(&memcg->event_list_lock);
   4691		if (!list_empty(&event->list)) {
   4692			list_del_init(&event->list);
   4693			/*
   4694			 * We are in atomic context, but memcg_event_remove()
   4695			 * may sleep, so we have to call it in workqueue.
   4696			 */
   4697			schedule_work(&event->remove);
   4698		}
   4699		spin_unlock(&memcg->event_list_lock);
   4700	}
   4701
   4702	return 0;
   4703}
   4704
   4705static void memcg_event_ptable_queue_proc(struct file *file,
   4706		wait_queue_head_t *wqh, poll_table *pt)
   4707{
   4708	struct mem_cgroup_event *event =
   4709		container_of(pt, struct mem_cgroup_event, pt);
   4710
   4711	event->wqh = wqh;
   4712	add_wait_queue(wqh, &event->wait);
   4713}
   4714
   4715/*
   4716 * DO NOT USE IN NEW FILES.
   4717 *
   4718 * Parse input and register new cgroup event handler.
   4719 *
   4720 * Input must be in format '<event_fd> <control_fd> <args>'.
   4721 * Interpretation of args is defined by control file implementation.
   4722 */
   4723static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
   4724					 char *buf, size_t nbytes, loff_t off)
   4725{
   4726	struct cgroup_subsys_state *css = of_css(of);
   4727	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   4728	struct mem_cgroup_event *event;
   4729	struct cgroup_subsys_state *cfile_css;
   4730	unsigned int efd, cfd;
   4731	struct fd efile;
   4732	struct fd cfile;
   4733	const char *name;
   4734	char *endp;
   4735	int ret;
   4736
   4737	if (IS_ENABLED(CONFIG_PREEMPT_RT))
   4738		return -EOPNOTSUPP;
   4739
   4740	buf = strstrip(buf);
   4741
   4742	efd = simple_strtoul(buf, &endp, 10);
   4743	if (*endp != ' ')
   4744		return -EINVAL;
   4745	buf = endp + 1;
   4746
   4747	cfd = simple_strtoul(buf, &endp, 10);
   4748	if ((*endp != ' ') && (*endp != '\0'))
   4749		return -EINVAL;
   4750	buf = endp + 1;
   4751
   4752	event = kzalloc(sizeof(*event), GFP_KERNEL);
   4753	if (!event)
   4754		return -ENOMEM;
   4755
   4756	event->memcg = memcg;
   4757	INIT_LIST_HEAD(&event->list);
   4758	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
   4759	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
   4760	INIT_WORK(&event->remove, memcg_event_remove);
   4761
   4762	efile = fdget(efd);
   4763	if (!efile.file) {
   4764		ret = -EBADF;
   4765		goto out_kfree;
   4766	}
   4767
   4768	event->eventfd = eventfd_ctx_fileget(efile.file);
   4769	if (IS_ERR(event->eventfd)) {
   4770		ret = PTR_ERR(event->eventfd);
   4771		goto out_put_efile;
   4772	}
   4773
   4774	cfile = fdget(cfd);
   4775	if (!cfile.file) {
   4776		ret = -EBADF;
   4777		goto out_put_eventfd;
   4778	}
   4779
   4780	/* the process needs read permission on the control file */
   4781	/* AV: shouldn't we check that it's been opened for read instead? */
   4782	ret = file_permission(cfile.file, MAY_READ);
   4783	if (ret < 0)
   4784		goto out_put_cfile;
   4785
   4786	/*
   4787	 * Determine the event callbacks and set them in @event.  This used
   4788	 * to be done via struct cftype but cgroup core no longer knows
   4789	 * about these events.  The following is crude but the whole thing
   4790	 * is for compatibility anyway.
   4791	 *
   4792	 * DO NOT ADD NEW FILES.
   4793	 */
   4794	name = cfile.file->f_path.dentry->d_name.name;
   4795
   4796	if (!strcmp(name, "memory.usage_in_bytes")) {
   4797		event->register_event = mem_cgroup_usage_register_event;
   4798		event->unregister_event = mem_cgroup_usage_unregister_event;
   4799	} else if (!strcmp(name, "memory.oom_control")) {
   4800		event->register_event = mem_cgroup_oom_register_event;
   4801		event->unregister_event = mem_cgroup_oom_unregister_event;
   4802	} else if (!strcmp(name, "memory.pressure_level")) {
   4803		event->register_event = vmpressure_register_event;
   4804		event->unregister_event = vmpressure_unregister_event;
   4805	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
   4806		event->register_event = memsw_cgroup_usage_register_event;
   4807		event->unregister_event = memsw_cgroup_usage_unregister_event;
   4808	} else {
   4809		ret = -EINVAL;
   4810		goto out_put_cfile;
   4811	}
   4812
   4813	/*
   4814	 * Verify that @cfile belongs to @css.  Also, remaining events are
   4815	 * automatically removed on cgroup destruction but the removal is
   4816	 * asynchronous, so take an extra ref on @css.
   4817	 */
   4818	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
   4819					       &memory_cgrp_subsys);
   4820	ret = -EINVAL;
   4821	if (IS_ERR(cfile_css))
   4822		goto out_put_cfile;
   4823	if (cfile_css != css) {
   4824		css_put(cfile_css);
   4825		goto out_put_cfile;
   4826	}
   4827
   4828	ret = event->register_event(memcg, event->eventfd, buf);
   4829	if (ret)
   4830		goto out_put_css;
   4831
   4832	vfs_poll(efile.file, &event->pt);
   4833
   4834	spin_lock_irq(&memcg->event_list_lock);
   4835	list_add(&event->list, &memcg->event_list);
   4836	spin_unlock_irq(&memcg->event_list_lock);
   4837
   4838	fdput(cfile);
   4839	fdput(efile);
   4840
   4841	return nbytes;
   4842
   4843out_put_css:
   4844	css_put(css);
   4845out_put_cfile:
   4846	fdput(cfile);
   4847out_put_eventfd:
   4848	eventfd_ctx_put(event->eventfd);
   4849out_put_efile:
   4850	fdput(efile);
   4851out_kfree:
   4852	kfree(event);
   4853
   4854	return ret;
   4855}
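
/*
 * Illustrative use of the legacy interface parsed above (see
 * Documentation/admin-guide/cgroup-v1/memory.rst); descriptor numbers and
 * the threshold are examples only:
 *
 *   1. efd = eventfd(0, 0);
 *   2. cfd = open(".../memory.usage_in_bytes", O_RDONLY);
 *   3. write the string "<efd> <cfd> 4M" to .../cgroup.event_control
 *   4. read(efd, ...) then blocks until usage crosses the 4M threshold.
 *
 * The trailing <args> ("4M" here) is passed unmodified to the matched
 * control file's register_event callback, so its interpretation depends on
 * which of the four supported files <cfd> refers to.
 */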
   4856
   4857#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
   4858static int mem_cgroup_slab_show(struct seq_file *m, void *p)
   4859{
   4860	/*
   4861	 * Deprecated.
   4862	 * Please take a look at tools/cgroup/memcg_slabinfo.py.
   4863	 */
   4864	return 0;
   4865}
   4866#endif
   4867
   4868static struct cftype mem_cgroup_legacy_files[] = {
   4869	{
   4870		.name = "usage_in_bytes",
   4871		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
   4872		.read_u64 = mem_cgroup_read_u64,
   4873	},
   4874	{
   4875		.name = "max_usage_in_bytes",
   4876		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
   4877		.write = mem_cgroup_reset,
   4878		.read_u64 = mem_cgroup_read_u64,
   4879	},
   4880	{
   4881		.name = "limit_in_bytes",
   4882		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
   4883		.write = mem_cgroup_write,
   4884		.read_u64 = mem_cgroup_read_u64,
   4885	},
   4886	{
   4887		.name = "soft_limit_in_bytes",
   4888		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
   4889		.write = mem_cgroup_write,
   4890		.read_u64 = mem_cgroup_read_u64,
   4891	},
   4892	{
   4893		.name = "failcnt",
   4894		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
   4895		.write = mem_cgroup_reset,
   4896		.read_u64 = mem_cgroup_read_u64,
   4897	},
   4898	{
   4899		.name = "stat",
   4900		.seq_show = memcg_stat_show,
   4901	},
   4902	{
   4903		.name = "force_empty",
   4904		.write = mem_cgroup_force_empty_write,
   4905	},
   4906	{
   4907		.name = "use_hierarchy",
   4908		.write_u64 = mem_cgroup_hierarchy_write,
   4909		.read_u64 = mem_cgroup_hierarchy_read,
   4910	},
   4911	{
   4912		.name = "cgroup.event_control",		/* XXX: for compat */
   4913		.write = memcg_write_event_control,
   4914		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
   4915	},
   4916	{
   4917		.name = "swappiness",
   4918		.read_u64 = mem_cgroup_swappiness_read,
   4919		.write_u64 = mem_cgroup_swappiness_write,
   4920	},
   4921	{
   4922		.name = "move_charge_at_immigrate",
   4923		.read_u64 = mem_cgroup_move_charge_read,
   4924		.write_u64 = mem_cgroup_move_charge_write,
   4925	},
   4926	{
   4927		.name = "oom_control",
   4928		.seq_show = mem_cgroup_oom_control_read,
   4929		.write_u64 = mem_cgroup_oom_control_write,
   4930	},
   4931	{
   4932		.name = "pressure_level",
   4933	},
   4934#ifdef CONFIG_NUMA
   4935	{
   4936		.name = "numa_stat",
   4937		.seq_show = memcg_numa_stat_show,
   4938	},
   4939#endif
   4940	{
   4941		.name = "kmem.limit_in_bytes",
   4942		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
   4943		.write = mem_cgroup_write,
   4944		.read_u64 = mem_cgroup_read_u64,
   4945	},
   4946	{
   4947		.name = "kmem.usage_in_bytes",
   4948		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
   4949		.read_u64 = mem_cgroup_read_u64,
   4950	},
   4951	{
   4952		.name = "kmem.failcnt",
   4953		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
   4954		.write = mem_cgroup_reset,
   4955		.read_u64 = mem_cgroup_read_u64,
   4956	},
   4957	{
   4958		.name = "kmem.max_usage_in_bytes",
   4959		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
   4960		.write = mem_cgroup_reset,
   4961		.read_u64 = mem_cgroup_read_u64,
   4962	},
   4963#if defined(CONFIG_MEMCG_KMEM) && \
   4964	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
   4965	{
   4966		.name = "kmem.slabinfo",
   4967		.seq_show = mem_cgroup_slab_show,
   4968	},
   4969#endif
   4970	{
   4971		.name = "kmem.tcp.limit_in_bytes",
   4972		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
   4973		.write = mem_cgroup_write,
   4974		.read_u64 = mem_cgroup_read_u64,
   4975	},
   4976	{
   4977		.name = "kmem.tcp.usage_in_bytes",
   4978		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
   4979		.read_u64 = mem_cgroup_read_u64,
   4980	},
   4981	{
   4982		.name = "kmem.tcp.failcnt",
   4983		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
   4984		.write = mem_cgroup_reset,
   4985		.read_u64 = mem_cgroup_read_u64,
   4986	},
   4987	{
   4988		.name = "kmem.tcp.max_usage_in_bytes",
   4989		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
   4990		.write = mem_cgroup_reset,
   4991		.read_u64 = mem_cgroup_read_u64,
   4992	},
   4993	{ },	/* terminate */
   4994};
   4995
   4996/*
   4997 * Private memory cgroup IDR
   4998 *
   4999 * Swap-out records and page cache shadow entries need to store memcg
   5000 * references in constrained space, so we maintain an ID space that is
   5001 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
   5002 * memory-controlled cgroups to 64k.
   5003 *
   5004 * However, there usually are many references to the offline CSS after
   5005 * the cgroup has been destroyed, such as page cache or reclaimable
   5006 * slab objects, that don't need to hang on to the ID. We want to keep
   5007 * those dead CSS from occupying IDs, or we might quickly exhaust the
   5008 * relatively small ID space and prevent the creation of new cgroups
   5009 * even when there are much fewer than 64k cgroups - possibly none.
   5010 *
   5011 * Maintain a private 16-bit ID space for memcg, and allow the ID to
   5012 * be freed and recycled when it's no longer needed, which is usually
   5013 * when the CSS is offlined.
   5014 *
   5015 * The only exception to that are records of swapped out tmpfs/shmem
   5016 * pages that need to be attributed to live ancestors on swapin. But
   5017 * those references are manageable from userspace.
   5018 */
   5019
   5020static DEFINE_IDR(mem_cgroup_idr);
   5021
   5022static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
   5023{
   5024	if (memcg->id.id > 0) {
   5025		idr_remove(&mem_cgroup_idr, memcg->id.id);
   5026		memcg->id.id = 0;
   5027	}
   5028}
   5029
   5030static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
   5031						  unsigned int n)
   5032{
   5033	refcount_add(n, &memcg->id.ref);
   5034}
   5035
   5036static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
   5037{
   5038	if (refcount_sub_and_test(n, &memcg->id.ref)) {
   5039		mem_cgroup_id_remove(memcg);
   5040
   5041		/* Memcg ID pins CSS */
   5042		css_put(&memcg->css);
   5043	}
   5044}
   5045
   5046static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
   5047{
   5048	mem_cgroup_id_put_many(memcg, 1);
   5049}
   5050
   5051/**
   5052 * mem_cgroup_from_id - look up a memcg from a memcg id
   5053 * @id: the memcg id to look up
   5054 *
   5055 * Caller must hold rcu_read_lock().
   5056 */
   5057struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
   5058{
   5059	WARN_ON_ONCE(!rcu_read_lock_held());
   5060	return idr_find(&mem_cgroup_idr, id);
   5061}
   5062
   5063static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
   5064{
   5065	struct mem_cgroup_per_node *pn;
   5066
   5067	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
   5068	if (!pn)
   5069		return 1;
   5070
   5071	pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
   5072						   GFP_KERNEL_ACCOUNT);
   5073	if (!pn->lruvec_stats_percpu) {
   5074		kfree(pn);
   5075		return 1;
   5076	}
   5077
   5078	lruvec_init(&pn->lruvec);
   5079	pn->memcg = memcg;
   5080
   5081	memcg->nodeinfo[node] = pn;
   5082	return 0;
   5083}
   5084
   5085static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
   5086{
   5087	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
   5088
   5089	if (!pn)
   5090		return;
   5091
   5092	free_percpu(pn->lruvec_stats_percpu);
   5093	kfree(pn);
   5094}
   5095
   5096static void __mem_cgroup_free(struct mem_cgroup *memcg)
   5097{
   5098	int node;
   5099
   5100	for_each_node(node)
   5101		free_mem_cgroup_per_node_info(memcg, node);
   5102	free_percpu(memcg->vmstats_percpu);
   5103	kfree(memcg);
   5104}
   5105
   5106static void mem_cgroup_free(struct mem_cgroup *memcg)
   5107{
   5108	memcg_wb_domain_exit(memcg);
   5109	__mem_cgroup_free(memcg);
   5110}
   5111
   5112static struct mem_cgroup *mem_cgroup_alloc(void)
   5113{
   5114	struct mem_cgroup *memcg;
   5115	int node;
   5116	int __maybe_unused i;
   5117	long error = -ENOMEM;
   5118
   5119	memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
   5120	if (!memcg)
   5121		return ERR_PTR(error);
   5122
   5123	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
   5124				 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
   5125	if (memcg->id.id < 0) {
   5126		error = memcg->id.id;
   5127		goto fail;
   5128	}
   5129
   5130	memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
   5131						 GFP_KERNEL_ACCOUNT);
   5132	if (!memcg->vmstats_percpu)
   5133		goto fail;
   5134
   5135	for_each_node(node)
   5136		if (alloc_mem_cgroup_per_node_info(memcg, node))
   5137			goto fail;
   5138
   5139	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
   5140		goto fail;
   5141
   5142	INIT_WORK(&memcg->high_work, high_work_func);
   5143	INIT_LIST_HEAD(&memcg->oom_notify);
   5144	mutex_init(&memcg->thresholds_lock);
   5145	spin_lock_init(&memcg->move_lock);
   5146	vmpressure_init(&memcg->vmpressure);
   5147	INIT_LIST_HEAD(&memcg->event_list);
   5148	spin_lock_init(&memcg->event_list_lock);
   5149	memcg->socket_pressure = jiffies;
   5150#ifdef CONFIG_MEMCG_KMEM
   5151	memcg->kmemcg_id = -1;
   5152	INIT_LIST_HEAD(&memcg->objcg_list);
   5153#endif
   5154#ifdef CONFIG_CGROUP_WRITEBACK
   5155	INIT_LIST_HEAD(&memcg->cgwb_list);
   5156	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
   5157		memcg->cgwb_frn[i].done =
   5158			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
   5159#endif
   5160#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   5161	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
   5162	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
   5163	memcg->deferred_split_queue.split_queue_len = 0;
   5164#endif
   5165	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
   5166	return memcg;
   5167fail:
   5168	mem_cgroup_id_remove(memcg);
   5169	__mem_cgroup_free(memcg);
   5170	return ERR_PTR(error);
   5171}
   5172
   5173static struct cgroup_subsys_state * __ref
   5174mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
   5175{
   5176	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
   5177	struct mem_cgroup *memcg, *old_memcg;
   5178
   5179	old_memcg = set_active_memcg(parent);
   5180	memcg = mem_cgroup_alloc();
   5181	set_active_memcg(old_memcg);
   5182	if (IS_ERR(memcg))
   5183		return ERR_CAST(memcg);
   5184
   5185	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
   5186	memcg->soft_limit = PAGE_COUNTER_MAX;
   5187#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
   5188	memcg->zswap_max = PAGE_COUNTER_MAX;
   5189#endif
   5190	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
   5191	if (parent) {
   5192		memcg->swappiness = mem_cgroup_swappiness(parent);
   5193		memcg->oom_kill_disable = parent->oom_kill_disable;
   5194
   5195		page_counter_init(&memcg->memory, &parent->memory);
   5196		page_counter_init(&memcg->swap, &parent->swap);
   5197		page_counter_init(&memcg->kmem, &parent->kmem);
   5198		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
   5199	} else {
   5200		page_counter_init(&memcg->memory, NULL);
   5201		page_counter_init(&memcg->swap, NULL);
   5202		page_counter_init(&memcg->kmem, NULL);
   5203		page_counter_init(&memcg->tcpmem, NULL);
   5204
   5205		root_mem_cgroup = memcg;
   5206		return &memcg->css;
   5207	}
   5208
   5209	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
   5210		static_branch_inc(&memcg_sockets_enabled_key);
   5211
   5212	return &memcg->css;
   5213}
   5214
   5215static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
   5216{
   5217	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   5218
   5219	if (memcg_online_kmem(memcg))
   5220		goto remove_id;
   5221
   5222	/*
   5223	 * A memcg must be visible for expand_shrinker_info()
   5224	 * by the time the maps are allocated. So, we allocate maps
   5225	 * here, when for_each_mem_cgroup() can't skip it.
   5226	 */
   5227	if (alloc_shrinker_info(memcg))
   5228		goto offline_kmem;
   5229
   5230	/* Online state pins memcg ID, memcg ID pins CSS */
   5231	refcount_set(&memcg->id.ref, 1);
   5232	css_get(css);
   5233
   5234	if (unlikely(mem_cgroup_is_root(memcg)))
   5235		queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
   5236				   2UL*HZ);
   5237	return 0;
   5238offline_kmem:
   5239	memcg_offline_kmem(memcg);
   5240remove_id:
   5241	mem_cgroup_id_remove(memcg);
   5242	return -ENOMEM;
   5243}
   5244
   5245static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
   5246{
   5247	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   5248	struct mem_cgroup_event *event, *tmp;
   5249
   5250	/*
   5251	 * Unregister events and notify userspace.
   5252	 * Notify userspace about cgroup removal only after rmdir of the cgroup
   5253	 * directory to avoid a race between userspace and kernelspace.
   5254	 */
   5255	spin_lock_irq(&memcg->event_list_lock);
   5256	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
   5257		list_del_init(&event->list);
   5258		schedule_work(&event->remove);
   5259	}
   5260	spin_unlock_irq(&memcg->event_list_lock);
   5261
   5262	page_counter_set_min(&memcg->memory, 0);
   5263	page_counter_set_low(&memcg->memory, 0);
   5264
   5265	memcg_offline_kmem(memcg);
   5266	reparent_shrinker_deferred(memcg);
   5267	wb_memcg_offline(memcg);
   5268
   5269	drain_all_stock(memcg);
   5270
   5271	mem_cgroup_id_put(memcg);
   5272}
   5273
   5274static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
   5275{
   5276	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   5277
   5278	invalidate_reclaim_iterators(memcg);
   5279}
   5280
   5281static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
   5282{
   5283	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   5284	int __maybe_unused i;
   5285
   5286#ifdef CONFIG_CGROUP_WRITEBACK
   5287	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
   5288		wb_wait_for_completion(&memcg->cgwb_frn[i].done);
   5289#endif
   5290	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
   5291		static_branch_dec(&memcg_sockets_enabled_key);
   5292
   5293	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
   5294		static_branch_dec(&memcg_sockets_enabled_key);
   5295
   5296	vmpressure_cleanup(&memcg->vmpressure);
   5297	cancel_work_sync(&memcg->high_work);
   5298	mem_cgroup_remove_from_trees(memcg);
   5299	free_shrinker_info(memcg);
   5300	mem_cgroup_free(memcg);
   5301}
   5302
   5303/**
   5304 * mem_cgroup_css_reset - reset the states of a mem_cgroup
   5305 * @css: the target css
   5306 *
   5307 * Reset the states of the mem_cgroup associated with @css.  This is
   5308 * invoked when the userland requests disabling on the default hierarchy
   5309 * but the memcg is pinned through dependency.  The memcg should stop
   5310 * applying policies and should revert to the vanilla state as it may be
   5311 * made visible again.
   5312 *
   5313 * The current implementation only resets the essential configurations.
   5314 * This needs to be expanded to cover all the visible parts.
   5315 */
   5316static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
   5317{
   5318	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   5319
   5320	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
   5321	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
   5322	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
   5323	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
   5324	page_counter_set_min(&memcg->memory, 0);
   5325	page_counter_set_low(&memcg->memory, 0);
   5326	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
   5327	memcg->soft_limit = PAGE_COUNTER_MAX;
   5328	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
   5329	memcg_wb_domain_size_changed(memcg);
   5330}
   5331
   5332static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
   5333{
   5334	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   5335	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
   5336	struct memcg_vmstats_percpu *statc;
   5337	long delta, v;
   5338	int i, nid;
   5339
   5340	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
   5341
   5342	for (i = 0; i < MEMCG_NR_STAT; i++) {
   5343		/*
   5344		 * Collect the aggregated propagation counts of groups
   5345		 * below us. We're in a per-cpu loop here and this is
   5346		 * a global counter, so the first cycle will get them.
   5347		 */
   5348		delta = memcg->vmstats.state_pending[i];
   5349		if (delta)
   5350			memcg->vmstats.state_pending[i] = 0;
   5351
   5352		/* Add CPU changes on this level since the last flush */
   5353		v = READ_ONCE(statc->state[i]);
   5354		if (v != statc->state_prev[i]) {
   5355			delta += v - statc->state_prev[i];
   5356			statc->state_prev[i] = v;
   5357		}
   5358
   5359		if (!delta)
   5360			continue;
   5361
   5362		/* Aggregate counts on this level and propagate upwards */
   5363		memcg->vmstats.state[i] += delta;
   5364		if (parent)
   5365			parent->vmstats.state_pending[i] += delta;
   5366	}
   5367
   5368	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
   5369		delta = memcg->vmstats.events_pending[i];
   5370		if (delta)
   5371			memcg->vmstats.events_pending[i] = 0;
   5372
   5373		v = READ_ONCE(statc->events[i]);
   5374		if (v != statc->events_prev[i]) {
   5375			delta += v - statc->events_prev[i];
   5376			statc->events_prev[i] = v;
   5377		}
   5378
   5379		if (!delta)
   5380			continue;
   5381
   5382		memcg->vmstats.events[i] += delta;
   5383		if (parent)
   5384			parent->vmstats.events_pending[i] += delta;
   5385	}
   5386
   5387	for_each_node_state(nid, N_MEMORY) {
   5388		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
   5389		struct mem_cgroup_per_node *ppn = NULL;
   5390		struct lruvec_stats_percpu *lstatc;
   5391
   5392		if (parent)
   5393			ppn = parent->nodeinfo[nid];
   5394
   5395		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
   5396
   5397		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
   5398			delta = pn->lruvec_stats.state_pending[i];
   5399			if (delta)
   5400				pn->lruvec_stats.state_pending[i] = 0;
   5401
   5402			v = READ_ONCE(lstatc->state[i]);
   5403			if (v != lstatc->state_prev[i]) {
   5404				delta += v - lstatc->state_prev[i];
   5405				lstatc->state_prev[i] = v;
   5406			}
   5407
   5408			if (!delta)
   5409				continue;
   5410
   5411			pn->lruvec_stats.state[i] += delta;
   5412			if (ppn)
   5413				ppn->lruvec_stats.state_pending[i] += delta;
   5414		}
   5415	}
   5416}
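
/*
 * Worked example for the flush above (illustrative numbers, not from the
 * source): if on this CPU statc->state[i] has moved from a state_prev
 * snapshot of 10 to 14 since the last flush, delta = 4 is folded into this
 * memcg's vmstats.state[i] and queued in the parent's state_pending[i].
 * The parent picks up that pending delta when it is flushed in turn, so
 * hierarchy-wide totals converge without atomics on the update path.
 */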
   5417
   5418#ifdef CONFIG_MMU
   5419/* Handlers for move charge at task migration. */
   5420static int mem_cgroup_do_precharge(unsigned long count)
   5421{
   5422	int ret;
   5423
   5424	/* Try a single bulk charge without reclaim first, kswapd may wake */
   5425	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
   5426	if (!ret) {
   5427		mc.precharge += count;
   5428		return ret;
   5429	}
   5430
   5431	/* Try charges one by one with reclaim, but do not retry */
   5432	while (count--) {
   5433		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
   5434		if (ret)
   5435			return ret;
   5436		mc.precharge++;
   5437		cond_resched();
   5438	}
   5439	return 0;
   5440}
   5441
   5442union mc_target {
   5443	struct page	*page;
   5444	swp_entry_t	ent;
   5445};
   5446
   5447enum mc_target_type {
   5448	MC_TARGET_NONE = 0,
   5449	MC_TARGET_PAGE,
   5450	MC_TARGET_SWAP,
   5451	MC_TARGET_DEVICE,
   5452};
   5453
   5454static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
   5455						unsigned long addr, pte_t ptent)
   5456{
   5457	struct page *page = vm_normal_page(vma, addr, ptent);
   5458
   5459	if (!page || !page_mapped(page))
   5460		return NULL;
   5461	if (PageAnon(page)) {
   5462		if (!(mc.flags & MOVE_ANON))
   5463			return NULL;
   5464	} else {
   5465		if (!(mc.flags & MOVE_FILE))
   5466			return NULL;
   5467	}
   5468	if (!get_page_unless_zero(page))
   5469		return NULL;
   5470
   5471	return page;
   5472}
   5473
   5474#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
   5475static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
   5476			pte_t ptent, swp_entry_t *entry)
   5477{
   5478	struct page *page = NULL;
   5479	swp_entry_t ent = pte_to_swp_entry(ptent);
   5480
   5481	if (!(mc.flags & MOVE_ANON))
   5482		return NULL;
   5483
   5484	/*
   5485	 * Handle device private pages that are not accessible by the CPU, but
   5486	 * stored as special swap entries in the page table.
   5487	 */
   5488	if (is_device_private_entry(ent)) {
   5489		page = pfn_swap_entry_to_page(ent);
   5490		if (!get_page_unless_zero(page))
   5491			return NULL;
   5492		return page;
   5493	}
   5494
   5495	if (non_swap_entry(ent))
   5496		return NULL;
   5497
   5498	/*
   5499	 * Because lookup_swap_cache() updates some statistics counters,
   5500	 * we call find_get_page() with swapper_space directly.
   5501	 */
   5502	page = find_get_page(swap_address_space(ent), swp_offset(ent));
   5503	entry->val = ent.val;
   5504
   5505	return page;
   5506}
   5507#else
   5508static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
   5509			pte_t ptent, swp_entry_t *entry)
   5510{
   5511	return NULL;
   5512}
   5513#endif
   5514
   5515static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
   5516			unsigned long addr, pte_t ptent)
   5517{
   5518	if (!vma->vm_file) /* anonymous vma */
   5519		return NULL;
   5520	if (!(mc.flags & MOVE_FILE))
   5521		return NULL;
   5522
   5523	/* The page is moved even if it is not part of this task's RSS (i.e. not faulted in by it). */
   5524	/* shmem/tmpfs may report the page out on swap: account for that too. */
   5525	return find_get_incore_page(vma->vm_file->f_mapping,
   5526			linear_page_index(vma, addr));
   5527}
   5528
   5529/**
   5530 * mem_cgroup_move_account - move account of the page
   5531 * @page: the page
   5532 * @compound: charge the page as compound or small page
   5533 * @from: mem_cgroup which the page is moved from.
   5534 * @to:	mem_cgroup which the page is moved to. @from != @to.
   5535 *
   5536 * The caller must make sure the page is not on the LRU (isolate_lru_page() is useful.)
   5537 *
   5538 * This function does not "charge" the new cgroup and does not "uncharge"
   5539 * the old cgroup.
   5540 */
   5541static int mem_cgroup_move_account(struct page *page,
   5542				   bool compound,
   5543				   struct mem_cgroup *from,
   5544				   struct mem_cgroup *to)
   5545{
   5546	struct folio *folio = page_folio(page);
   5547	struct lruvec *from_vec, *to_vec;
   5548	struct pglist_data *pgdat;
   5549	unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
   5550	int nid, ret;
   5551
   5552	VM_BUG_ON(from == to);
   5553	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
   5554	VM_BUG_ON(compound && !folio_test_large(folio));
   5555
   5556	/*
   5557	 * Prevent mem_cgroup_migrate() from looking at the source
   5558	 * page's memory cgroup while we change it.
   5559	 */
   5560	ret = -EBUSY;
   5561	if (!folio_trylock(folio))
   5562		goto out;
   5563
   5564	ret = -EINVAL;
   5565	if (folio_memcg(folio) != from)
   5566		goto out_unlock;
   5567
   5568	pgdat = folio_pgdat(folio);
   5569	from_vec = mem_cgroup_lruvec(from, pgdat);
   5570	to_vec = mem_cgroup_lruvec(to, pgdat);
   5571
   5572	folio_memcg_lock(folio);
   5573
   5574	if (folio_test_anon(folio)) {
   5575		if (folio_mapped(folio)) {
   5576			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
   5577			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
   5578			if (folio_test_transhuge(folio)) {
   5579				__mod_lruvec_state(from_vec, NR_ANON_THPS,
   5580						   -nr_pages);
   5581				__mod_lruvec_state(to_vec, NR_ANON_THPS,
   5582						   nr_pages);
   5583			}
   5584		}
   5585	} else {
   5586		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
   5587		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
   5588
   5589		if (folio_test_swapbacked(folio)) {
   5590			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
   5591			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
   5592		}
   5593
   5594		if (folio_mapped(folio)) {
   5595			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
   5596			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
   5597		}
   5598
   5599		if (folio_test_dirty(folio)) {
   5600			struct address_space *mapping = folio_mapping(folio);
   5601
   5602			if (mapping_can_writeback(mapping)) {
   5603				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
   5604						   -nr_pages);
   5605				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
   5606						   nr_pages);
   5607			}
   5608		}
   5609	}
   5610
   5611	if (folio_test_writeback(folio)) {
   5612		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
   5613		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
   5614	}
   5615
   5616	/*
   5617	 * All state has been migrated, let's switch to the new memcg.
   5618	 *
   5619	 * It is safe to change page's memcg here because the page
   5620	 * is referenced, charged, isolated, and locked: we can't race
   5621	 * with (un)charging, migration, LRU putback, or anything else
   5622	 * that would rely on a stable page's memory cgroup.
   5623	 *
   5624	 * Note that lock_page_memcg is a memcg lock, not a page lock,
   5625	 * to save space. As soon as we switch page's memory cgroup to a
   5626	 * new memcg that isn't locked, the above state can change
   5627	 * concurrently again. Make sure we're truly done with it.
   5628	 */
   5629	smp_mb();
   5630
   5631	css_get(&to->css);
   5632	css_put(&from->css);
   5633
   5634	folio->memcg_data = (unsigned long)to;
   5635
   5636	__folio_memcg_unlock(from);
   5637
   5638	ret = 0;
   5639	nid = folio_nid(folio);
   5640
   5641	local_irq_disable();
   5642	mem_cgroup_charge_statistics(to, nr_pages);
   5643	memcg_check_events(to, nid);
   5644	mem_cgroup_charge_statistics(from, -nr_pages);
   5645	memcg_check_events(from, nid);
   5646	local_irq_enable();
   5647out_unlock:
   5648	folio_unlock(folio);
   5649out:
   5650	return ret;
   5651}
   5652
   5653/**
   5654 * get_mctgt_type - get target type of moving charge
   5655 * @vma: the vma to which the pte being checked belongs
   5656 * @addr: the address corresponding to the pte to be checked
   5657 * @ptent: the pte to be checked
   5658 * @target: where the target page or swap entry is stored (can be NULL)
   5659 *
   5660 * Returns
   5661 *   0 (MC_TARGET_NONE): if the pte is not a target for move charge.
   5662 *   1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for
   5663 *     move charge. If @target is not NULL, the page is stored in target->page
   5664 *     with an extra reference taken (callers must handle it).
   5665 *   2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   5666 *     target for charge migration. If @target is not NULL, the entry is stored
   5667 *     in target->ent.
   5668 *   3 (MC_TARGET_DEVICE): like MC_TARGET_PAGE, but the page is MEMORY_DEVICE_PRIVATE
   5669 *     (i.e. a ZONE_DEVICE page and thus not on the LRU).
   5670 *     For now such a page is charged like a regular page would be, as for all
   5671 *     intents and purposes it is just special memory taking the place of a
   5672 *     regular page.
   5673 *
   5674 *     See Documentation/vm/hmm.rst and include/linux/hmm.h
   5675 *
   5676 * Called with pte lock held.
   5677 */
   5678
   5679static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
   5680		unsigned long addr, pte_t ptent, union mc_target *target)
   5681{
   5682	struct page *page = NULL;
   5683	enum mc_target_type ret = MC_TARGET_NONE;
   5684	swp_entry_t ent = { .val = 0 };
   5685
   5686	if (pte_present(ptent))
   5687		page = mc_handle_present_pte(vma, addr, ptent);
   5688	else if (pte_none_mostly(ptent))
   5689		/*
   5690		 * PTE markers should be treated as a none pte here, separated
   5691		 * from other swap handling below.
   5692		 */
   5693		page = mc_handle_file_pte(vma, addr, ptent);
   5694	else if (is_swap_pte(ptent))
   5695		page = mc_handle_swap_pte(vma, ptent, &ent);
   5696
   5697	if (!page && !ent.val)
   5698		return ret;
   5699	if (page) {
   5700		/*
   5701		 * Do only a loose check w/o serialization.
   5702		 * mem_cgroup_move_account() checks whether the page is valid
   5703		 * under LRU exclusion.
   5704		 */
   5705		if (page_memcg(page) == mc.from) {
   5706			ret = MC_TARGET_PAGE;
   5707			if (is_device_private_page(page))
   5708				ret = MC_TARGET_DEVICE;
   5709			if (target)
   5710				target->page = page;
   5711		}
   5712		if (!ret || !target)
   5713			put_page(page);
   5714	}
   5715	/*
   5716	 * There is a swap entry and the page doesn't exist or isn't charged.
   5717	 * But we cannot move a tail page of a THP.
   5718	 */
   5719	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
   5720	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
   5721		ret = MC_TARGET_SWAP;
   5722		if (target)
   5723			target->ent = ent;
   5724	}
   5725	return ret;
   5726}
   5727
   5728#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   5729/*
   5730 * We don't consider PMD mapped swapping or file mapped pages because THP does
   5731 * not support them for now.
   5732 * Caller should make sure that pmd_trans_huge(pmd) is true.
   5733 */
   5734static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
   5735		unsigned long addr, pmd_t pmd, union mc_target *target)
   5736{
   5737	struct page *page = NULL;
   5738	enum mc_target_type ret = MC_TARGET_NONE;
   5739
   5740	if (unlikely(is_swap_pmd(pmd))) {
   5741		VM_BUG_ON(thp_migration_supported() &&
   5742				  !is_pmd_migration_entry(pmd));
   5743		return ret;
   5744	}
   5745	page = pmd_page(pmd);
   5746	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
   5747	if (!(mc.flags & MOVE_ANON))
   5748		return ret;
   5749	if (page_memcg(page) == mc.from) {
   5750		ret = MC_TARGET_PAGE;
   5751		if (target) {
   5752			get_page(page);
   5753			target->page = page;
   5754		}
   5755	}
   5756	return ret;
   5757}
   5758#else
   5759static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
   5760		unsigned long addr, pmd_t pmd, union mc_target *target)
   5761{
   5762	return MC_TARGET_NONE;
   5763}
   5764#endif
   5765
   5766static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
   5767					unsigned long addr, unsigned long end,
   5768					struct mm_walk *walk)
   5769{
   5770	struct vm_area_struct *vma = walk->vma;
   5771	pte_t *pte;
   5772	spinlock_t *ptl;
   5773
   5774	ptl = pmd_trans_huge_lock(pmd, vma);
   5775	if (ptl) {
   5776		/*
   5777		 * Note there cannot be MC_TARGET_DEVICE for now, as we do not
   5778		 * support transparent huge pages with MEMORY_DEVICE_PRIVATE,
   5779		 * but this might change.
   5780		 */
   5781		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
   5782			mc.precharge += HPAGE_PMD_NR;
   5783		spin_unlock(ptl);
   5784		return 0;
   5785	}
   5786
   5787	if (pmd_trans_unstable(pmd))
   5788		return 0;
   5789	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
   5790	for (; addr != end; pte++, addr += PAGE_SIZE)
   5791		if (get_mctgt_type(vma, addr, *pte, NULL))
   5792			mc.precharge++;	/* increment precharge temporarily */
   5793	pte_unmap_unlock(pte - 1, ptl);
   5794	cond_resched();
   5795
   5796	return 0;
   5797}
   5798
   5799static const struct mm_walk_ops precharge_walk_ops = {
   5800	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
   5801};
   5802
   5803static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
   5804{
   5805	unsigned long precharge;
   5806
   5807	mmap_read_lock(mm);
   5808	walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
   5809	mmap_read_unlock(mm);
   5810
   5811	precharge = mc.precharge;
   5812	mc.precharge = 0;
   5813
   5814	return precharge;
   5815}
   5816
   5817static int mem_cgroup_precharge_mc(struct mm_struct *mm)
   5818{
   5819	unsigned long precharge = mem_cgroup_count_precharge(mm);
   5820
   5821	VM_BUG_ON(mc.moving_task);
   5822	mc.moving_task = current;
   5823	return mem_cgroup_do_precharge(precharge);
   5824}
   5825
   5826/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
   5827static void __mem_cgroup_clear_mc(void)
   5828{
   5829	struct mem_cgroup *from = mc.from;
   5830	struct mem_cgroup *to = mc.to;
   5831
   5832	/* we must uncharge all the leftover precharges from mc.to */
   5833	if (mc.precharge) {
   5834		cancel_charge(mc.to, mc.precharge);
   5835		mc.precharge = 0;
   5836	}
   5837	/*
   5838	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
   5839	 * we must uncharge here.
   5840	 */
   5841	if (mc.moved_charge) {
   5842		cancel_charge(mc.from, mc.moved_charge);
   5843		mc.moved_charge = 0;
   5844	}
   5845	/* we must fixup refcnts and charges */
   5846	if (mc.moved_swap) {
   5847		/* uncharge swap account from the old cgroup */
   5848		if (!mem_cgroup_is_root(mc.from))
   5849			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
   5850
   5851		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
   5852
   5853		/*
   5854		 * we charged both to->memory and to->memsw, so we
   5855		 * should uncharge to->memory.
   5856		 */
   5857		if (!mem_cgroup_is_root(mc.to))
   5858			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
   5859
   5860		mc.moved_swap = 0;
   5861	}
   5862	memcg_oom_recover(from);
   5863	memcg_oom_recover(to);
   5864	wake_up_all(&mc.waitq);
   5865}
   5866
   5867static void mem_cgroup_clear_mc(void)
   5868{
   5869	struct mm_struct *mm = mc.mm;
   5870
   5871	/*
   5872	 * we must clear moving_task before waking up waiters at the end of
   5873	 * task migration.
   5874	 */
   5875	mc.moving_task = NULL;
   5876	__mem_cgroup_clear_mc();
   5877	spin_lock(&mc.lock);
   5878	mc.from = NULL;
   5879	mc.to = NULL;
   5880	mc.mm = NULL;
   5881	spin_unlock(&mc.lock);
   5882
   5883	mmput(mm);
   5884}
   5885
   5886static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
   5887{
   5888	struct cgroup_subsys_state *css;
   5889	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
   5890	struct mem_cgroup *from;
   5891	struct task_struct *leader, *p;
   5892	struct mm_struct *mm;
   5893	unsigned long move_flags;
   5894	int ret = 0;
   5895
   5896	/* charge immigration isn't supported on the default hierarchy */
   5897	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
   5898		return 0;
   5899
   5900	/*
   5901	 * Multi-process migrations only happen on the default hierarchy
   5902	 * where charge immigration is not used.  Perform charge
   5903	 * immigration if @tset contains a leader and whine if there are
   5904	 * multiple.
   5905	 */
   5906	p = NULL;
   5907	cgroup_taskset_for_each_leader(leader, css, tset) {
   5908		WARN_ON_ONCE(p);
   5909		p = leader;
   5910		memcg = mem_cgroup_from_css(css);
   5911	}
   5912	if (!p)
   5913		return 0;
   5914
   5915	/*
   5916	 * We are now committed to this value whatever it is. Changes in this
   5917	 * tunable will only affect upcoming migrations, not the current one.
   5918	 * So we need to save it, and keep it going.
   5919	 */
   5920	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
   5921	if (!move_flags)
   5922		return 0;
   5923
   5924	from = mem_cgroup_from_task(p);
   5925
   5926	VM_BUG_ON(from == memcg);
   5927
   5928	mm = get_task_mm(p);
   5929	if (!mm)
   5930		return 0;
   5931	/* We move charges only when we move the owner of the mm */
   5932	if (mm->owner == p) {
   5933		VM_BUG_ON(mc.from);
   5934		VM_BUG_ON(mc.to);
   5935		VM_BUG_ON(mc.precharge);
   5936		VM_BUG_ON(mc.moved_charge);
   5937		VM_BUG_ON(mc.moved_swap);
   5938
   5939		spin_lock(&mc.lock);
   5940		mc.mm = mm;
   5941		mc.from = from;
   5942		mc.to = memcg;
   5943		mc.flags = move_flags;
   5944		spin_unlock(&mc.lock);
   5945		/* We set mc.moving_task later */
   5946
   5947		ret = mem_cgroup_precharge_mc(mm);
   5948		if (ret)
   5949			mem_cgroup_clear_mc();
   5950	} else {
   5951		mmput(mm);
   5952	}
   5953	return ret;
   5954}
   5955
   5956static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
   5957{
   5958	if (mc.to)
   5959		mem_cgroup_clear_mc();
   5960}
   5961
   5962static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
   5963				unsigned long addr, unsigned long end,
   5964				struct mm_walk *walk)
   5965{
   5966	int ret = 0;
   5967	struct vm_area_struct *vma = walk->vma;
   5968	pte_t *pte;
   5969	spinlock_t *ptl;
   5970	enum mc_target_type target_type;
   5971	union mc_target target;
   5972	struct page *page;
   5973
   5974	ptl = pmd_trans_huge_lock(pmd, vma);
   5975	if (ptl) {
   5976		if (mc.precharge < HPAGE_PMD_NR) {
   5977			spin_unlock(ptl);
   5978			return 0;
   5979		}
   5980		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
   5981		if (target_type == MC_TARGET_PAGE) {
   5982			page = target.page;
   5983			if (!isolate_lru_page(page)) {
   5984				if (!mem_cgroup_move_account(page, true,
   5985							     mc.from, mc.to)) {
   5986					mc.precharge -= HPAGE_PMD_NR;
   5987					mc.moved_charge += HPAGE_PMD_NR;
   5988				}
   5989				putback_lru_page(page);
   5990			}
   5991			put_page(page);
   5992		} else if (target_type == MC_TARGET_DEVICE) {
   5993			page = target.page;
   5994			if (!mem_cgroup_move_account(page, true,
   5995						     mc.from, mc.to)) {
   5996				mc.precharge -= HPAGE_PMD_NR;
   5997				mc.moved_charge += HPAGE_PMD_NR;
   5998			}
   5999			put_page(page);
   6000		}
   6001		spin_unlock(ptl);
   6002		return 0;
   6003	}
   6004
   6005	if (pmd_trans_unstable(pmd))
   6006		return 0;
   6007retry:
   6008	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
   6009	for (; addr != end; addr += PAGE_SIZE) {
   6010		pte_t ptent = *(pte++);
   6011		bool device = false;
   6012		swp_entry_t ent;
   6013
   6014		if (!mc.precharge)
   6015			break;
   6016
   6017		switch (get_mctgt_type(vma, addr, ptent, &target)) {
   6018		case MC_TARGET_DEVICE:
   6019			device = true;
   6020			fallthrough;
   6021		case MC_TARGET_PAGE:
   6022			page = target.page;
   6023			/*
   6024			 * We can have a part of a split pmd here. Moving it
   6025			 * can be done, but it would be too convoluted, so simply
   6026			 * ignore such a partial THP and keep it in the original
   6027			 * memcg. There should be somebody mapping the head.
   6028			 */
   6029			if (PageTransCompound(page))
   6030				goto put;
   6031			if (!device && isolate_lru_page(page))
   6032				goto put;
   6033			if (!mem_cgroup_move_account(page, false,
   6034						mc.from, mc.to)) {
   6035				mc.precharge--;
   6036				/* we uncharge from mc.from later. */
   6037				mc.moved_charge++;
   6038			}
   6039			if (!device)
   6040				putback_lru_page(page);
   6041put:			/* get_mctgt_type() gets the page */
   6042			put_page(page);
   6043			break;
   6044		case MC_TARGET_SWAP:
   6045			ent = target.ent;
   6046			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
   6047				mc.precharge--;
   6048				mem_cgroup_id_get_many(mc.to, 1);
   6049				/* we fixup other refcnts and charges later. */
   6050				mc.moved_swap++;
   6051			}
   6052			break;
   6053		default:
   6054			break;
   6055		}
   6056	}
   6057	pte_unmap_unlock(pte - 1, ptl);
   6058	cond_resched();
   6059
   6060	if (addr != end) {
   6061		/*
   6062		 * We have consumed all precharges we got in can_attach().
   6063		 * We try to charge one by one, but don't do any additional
   6064		 * charges to mc.to once charging has failed in the attach()
   6065		 * phase.
   6066		 */
   6067		ret = mem_cgroup_do_precharge(1);
   6068		if (!ret)
   6069			goto retry;
   6070	}
   6071
   6072	return ret;
   6073}
   6074
   6075static const struct mm_walk_ops charge_walk_ops = {
   6076	.pmd_entry	= mem_cgroup_move_charge_pte_range,
   6077};
   6078
   6079static void mem_cgroup_move_charge(void)
   6080{
   6081	lru_add_drain_all();
   6082	/*
   6083	 * Signal lock_page_memcg() to take the memcg's move_lock
   6084	 * while we're moving its pages to another memcg. Then wait
   6085	 * for already started RCU-only updates to finish.
   6086	 */
   6087	atomic_inc(&mc.from->moving_account);
   6088	synchronize_rcu();
   6089retry:
   6090	if (unlikely(!mmap_read_trylock(mc.mm))) {
   6091		/*
   6092		 * Someone holding the mmap_lock might be waiting on our waitq.
   6093		 * So we cancel all extra charges, wake up all waiters,
   6094		 * and retry. Because we cancel precharges, we might not be able
   6095		 * to move enough charges, but moving charge is a best-effort
   6096		 * feature anyway, so it wouldn't be a big problem.
   6097		 */
   6098		__mem_cgroup_clear_mc();
   6099		cond_resched();
   6100		goto retry;
   6101	}
   6102	/*
   6103	 * When we have consumed all precharges and failed in doing
   6104	 * additional charge, the page walk just aborts.
   6105	 */
   6106	walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
   6107			NULL);
   6108
   6109	mmap_read_unlock(mc.mm);
   6110	atomic_dec(&mc.from->moving_account);
   6111}
   6112
   6113static void mem_cgroup_move_task(void)
   6114{
   6115	if (mc.to) {
   6116		mem_cgroup_move_charge();
   6117		mem_cgroup_clear_mc();
   6118	}
   6119}
   6120#else	/* !CONFIG_MMU */
   6121static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
   6122{
   6123	return 0;
   6124}
   6125static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
   6126{
   6127}
   6128static void mem_cgroup_move_task(void)
   6129{
   6130}
   6131#endif
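
/*
 * Usage note for the move-charge machinery above (an assumption about the
 * legacy interface, not taken from this file): it is only active on
 * cgroup v1, where e.g. "echo 3 > memory.move_charge_at_immigrate" selects
 * MOVE_ANON | MOVE_FILE so that a migrating task's charges follow it into
 * the destination cgroup.
 */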
   6132
   6133static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
   6134{
   6135	if (value == PAGE_COUNTER_MAX)
   6136		seq_puts(m, "max\n");
   6137	else
   6138		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
   6139
   6140	return 0;
   6141}
   6142
   6143static u64 memory_current_read(struct cgroup_subsys_state *css,
   6144			       struct cftype *cft)
   6145{
   6146	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   6147
   6148	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
   6149}
   6150
   6151static u64 memory_peak_read(struct cgroup_subsys_state *css,
   6152			    struct cftype *cft)
   6153{
   6154	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   6155
   6156	return (u64)memcg->memory.watermark * PAGE_SIZE;
   6157}
   6158
   6159static int memory_min_show(struct seq_file *m, void *v)
   6160{
   6161	return seq_puts_memcg_tunable(m,
   6162		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
   6163}
   6164
   6165static ssize_t memory_min_write(struct kernfs_open_file *of,
   6166				char *buf, size_t nbytes, loff_t off)
   6167{
   6168	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   6169	unsigned long min;
   6170	int err;
   6171
   6172	buf = strstrip(buf);
   6173	err = page_counter_memparse(buf, "max", &min);
   6174	if (err)
   6175		return err;
   6176
   6177	page_counter_set_min(&memcg->memory, min);
   6178
   6179	return nbytes;
   6180}
   6181
   6182static int memory_low_show(struct seq_file *m, void *v)
   6183{
   6184	return seq_puts_memcg_tunable(m,
   6185		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
   6186}
   6187
   6188static ssize_t memory_low_write(struct kernfs_open_file *of,
   6189				char *buf, size_t nbytes, loff_t off)
   6190{
   6191	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   6192	unsigned long low;
   6193	int err;
   6194
   6195	buf = strstrip(buf);
   6196	err = page_counter_memparse(buf, "max", &low);
   6197	if (err)
   6198		return err;
   6199
   6200	page_counter_set_low(&memcg->memory, low);
   6201
   6202	return nbytes;
   6203}
   6204
   6205static int memory_high_show(struct seq_file *m, void *v)
   6206{
   6207	return seq_puts_memcg_tunable(m,
   6208		READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
   6209}
   6210
   6211static ssize_t memory_high_write(struct kernfs_open_file *of,
   6212				 char *buf, size_t nbytes, loff_t off)
   6213{
   6214	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   6215	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
   6216	bool drained = false;
   6217	unsigned long high;
   6218	int err;
   6219
   6220	buf = strstrip(buf);
   6221	err = page_counter_memparse(buf, "max", &high);
   6222	if (err)
   6223		return err;
   6224
   6225	page_counter_set_high(&memcg->memory, high);
   6226
   6227	for (;;) {
   6228		unsigned long nr_pages = page_counter_read(&memcg->memory);
   6229		unsigned long reclaimed;
   6230
   6231		if (nr_pages <= high)
   6232			break;
   6233
   6234		if (signal_pending(current))
   6235			break;
   6236
   6237		if (!drained) {
   6238			drain_all_stock(memcg);
   6239			drained = true;
   6240			continue;
   6241		}
   6242
   6243		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
   6244							 GFP_KERNEL, true);
   6245
   6246		if (!reclaimed && !nr_retries--)
   6247			break;
   6248	}
   6249
   6250	memcg_wb_domain_size_changed(memcg);
   6251	return nbytes;
   6252}
   6253
   6254static int memory_max_show(struct seq_file *m, void *v)
   6255{
   6256	return seq_puts_memcg_tunable(m,
   6257		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
   6258}
   6259
   6260static ssize_t memory_max_write(struct kernfs_open_file *of,
   6261				char *buf, size_t nbytes, loff_t off)
   6262{
   6263	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   6264	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
   6265	bool drained = false;
   6266	unsigned long max;
   6267	int err;
   6268
   6269	buf = strstrip(buf);
   6270	err = page_counter_memparse(buf, "max", &max);
   6271	if (err)
   6272		return err;
   6273
   6274	xchg(&memcg->memory.max, max);
   6275
   6276	for (;;) {
   6277		unsigned long nr_pages = page_counter_read(&memcg->memory);
   6278
   6279		if (nr_pages <= max)
   6280			break;
   6281
   6282		if (signal_pending(current))
   6283			break;
   6284
   6285		if (!drained) {
   6286			drain_all_stock(memcg);
   6287			drained = true;
   6288			continue;
   6289		}
   6290
   6291		if (nr_reclaims) {
   6292			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
   6293							  GFP_KERNEL, true))
   6294				nr_reclaims--;
   6295			continue;
   6296		}
   6297
   6298		memcg_memory_event(memcg, MEMCG_OOM);
   6299		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
   6300			break;
   6301	}
   6302
   6303	memcg_wb_domain_size_changed(memcg);
   6304	return nbytes;
   6305}
   6306
   6307static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
   6308{
   6309	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
   6310	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
   6311	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
   6312	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
   6313	seq_printf(m, "oom_kill %lu\n",
   6314		   atomic_long_read(&events[MEMCG_OOM_KILL]));
   6315	seq_printf(m, "oom_group_kill %lu\n",
   6316		   atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
   6317}
   6318
   6319static int memory_events_show(struct seq_file *m, void *v)
   6320{
   6321	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   6322
   6323	__memory_events_show(m, memcg->memory_events);
   6324	return 0;
   6325}
   6326
   6327static int memory_events_local_show(struct seq_file *m, void *v)
   6328{
   6329	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   6330
   6331	__memory_events_show(m, memcg->memory_events_local);
   6332	return 0;
   6333}
   6334
   6335static int memory_stat_show(struct seq_file *m, void *v)
   6336{
   6337	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   6338	char *buf;
   6339
   6340	buf = memory_stat_format(memcg);
   6341	if (!buf)
   6342		return -ENOMEM;
   6343	seq_puts(m, buf);
   6344	kfree(buf);
   6345	return 0;
   6346}
   6347
   6348#ifdef CONFIG_NUMA
   6349static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
   6350						     int item)
   6351{
   6352	return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
   6353}
   6354
   6355static int memory_numa_stat_show(struct seq_file *m, void *v)
   6356{
   6357	int i;
   6358	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   6359
   6360	mem_cgroup_flush_stats();
   6361
   6362	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
   6363		int nid;
   6364
   6365		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
   6366			continue;
   6367
   6368		seq_printf(m, "%s", memory_stats[i].name);
   6369		for_each_node_state(nid, N_MEMORY) {
   6370			u64 size;
   6371			struct lruvec *lruvec;
   6372
   6373			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
   6374			size = lruvec_page_state_output(lruvec,
   6375							memory_stats[i].idx);
   6376			seq_printf(m, " N%d=%llu", nid, size);
   6377		}
   6378		seq_putc(m, '\n');
   6379	}
   6380
   6381	return 0;
   6382}
   6383#endif
   6384
   6385static int memory_oom_group_show(struct seq_file *m, void *v)
   6386{
   6387	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   6388
   6389	seq_printf(m, "%d\n", memcg->oom_group);
   6390
   6391	return 0;
   6392}
   6393
   6394static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
   6395				      char *buf, size_t nbytes, loff_t off)
   6396{
   6397	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   6398	int ret, oom_group;
   6399
   6400	buf = strstrip(buf);
   6401	if (!buf)
   6402		return -EINVAL;
   6403
   6404	ret = kstrtoint(buf, 0, &oom_group);
   6405	if (ret)
   6406		return ret;
   6407
   6408	if (oom_group != 0 && oom_group != 1)
   6409		return -EINVAL;
   6410
   6411	memcg->oom_group = oom_group;
   6412
   6413	return nbytes;
   6414}
   6415
   6416static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
   6417			      size_t nbytes, loff_t off)
   6418{
   6419	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   6420	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
   6421	unsigned long nr_to_reclaim, nr_reclaimed = 0;
   6422	int err;
   6423
   6424	buf = strstrip(buf);
   6425	err = page_counter_memparse(buf, "", &nr_to_reclaim);
   6426	if (err)
   6427		return err;
   6428
   6429	while (nr_reclaimed < nr_to_reclaim) {
   6430		unsigned long reclaimed;
   6431
   6432		if (signal_pending(current))
   6433			return -EINTR;
   6434
   6435		/*
   6436		 * This is the final attempt; drain percpu lru caches in the
   6437		 * hope of introducing more evictable pages for
   6438		 * try_to_free_mem_cgroup_pages().
   6439		 */
   6440		if (!nr_retries)
   6441			lru_add_drain_all();
   6442
   6443		reclaimed = try_to_free_mem_cgroup_pages(memcg,
   6444						nr_to_reclaim - nr_reclaimed,
   6445						GFP_KERNEL, true);
   6446
   6447		if (!reclaimed && !nr_retries--)
   6448			return -EAGAIN;
   6449
   6450		nr_reclaimed += reclaimed;
   6451	}
   6452
   6453	return nbytes;
   6454}
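
/*
 * Usage sketch for memory.reclaim above (an assumption about typical use,
 * not taken from this file): writing a size such as
 * "echo 256M > /sys/fs/cgroup/<group>/memory.reclaim" asks the kernel to
 * proactively reclaim about 256M from the cgroup; the write returns
 * -EAGAIN if the target cannot be met within MAX_RECLAIM_RETRIES passes.
 */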
   6455
   6456static struct cftype memory_files[] = {
   6457	{
   6458		.name = "current",
   6459		.flags = CFTYPE_NOT_ON_ROOT,
   6460		.read_u64 = memory_current_read,
   6461	},
   6462	{
   6463		.name = "peak",
   6464		.flags = CFTYPE_NOT_ON_ROOT,
   6465		.read_u64 = memory_peak_read,
   6466	},
   6467	{
   6468		.name = "min",
   6469		.flags = CFTYPE_NOT_ON_ROOT,
   6470		.seq_show = memory_min_show,
   6471		.write = memory_min_write,
   6472	},
   6473	{
   6474		.name = "low",
   6475		.flags = CFTYPE_NOT_ON_ROOT,
   6476		.seq_show = memory_low_show,
   6477		.write = memory_low_write,
   6478	},
   6479	{
   6480		.name = "high",
   6481		.flags = CFTYPE_NOT_ON_ROOT,
   6482		.seq_show = memory_high_show,
   6483		.write = memory_high_write,
   6484	},
   6485	{
   6486		.name = "max",
   6487		.flags = CFTYPE_NOT_ON_ROOT,
   6488		.seq_show = memory_max_show,
   6489		.write = memory_max_write,
   6490	},
   6491	{
   6492		.name = "events",
   6493		.flags = CFTYPE_NOT_ON_ROOT,
   6494		.file_offset = offsetof(struct mem_cgroup, events_file),
   6495		.seq_show = memory_events_show,
   6496	},
   6497	{
   6498		.name = "events.local",
   6499		.flags = CFTYPE_NOT_ON_ROOT,
   6500		.file_offset = offsetof(struct mem_cgroup, events_local_file),
   6501		.seq_show = memory_events_local_show,
   6502	},
   6503	{
   6504		.name = "stat",
   6505		.seq_show = memory_stat_show,
   6506	},
   6507#ifdef CONFIG_NUMA
   6508	{
   6509		.name = "numa_stat",
   6510		.seq_show = memory_numa_stat_show,
   6511	},
   6512#endif
   6513	{
   6514		.name = "oom.group",
   6515		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
   6516		.seq_show = memory_oom_group_show,
   6517		.write = memory_oom_group_write,
   6518	},
   6519	{
   6520		.name = "reclaim",
   6521		.flags = CFTYPE_NS_DELEGATABLE,
   6522		.write = memory_reclaim,
   6523	},
   6524	{ }	/* terminate */
   6525};
   6526
   6527struct cgroup_subsys memory_cgrp_subsys = {
   6528	.css_alloc = mem_cgroup_css_alloc,
   6529	.css_online = mem_cgroup_css_online,
   6530	.css_offline = mem_cgroup_css_offline,
   6531	.css_released = mem_cgroup_css_released,
   6532	.css_free = mem_cgroup_css_free,
   6533	.css_reset = mem_cgroup_css_reset,
   6534	.css_rstat_flush = mem_cgroup_css_rstat_flush,
   6535	.can_attach = mem_cgroup_can_attach,
   6536	.cancel_attach = mem_cgroup_cancel_attach,
   6537	.post_attach = mem_cgroup_move_task,
   6538	.dfl_cftypes = memory_files,
   6539	.legacy_cftypes = mem_cgroup_legacy_files,
   6540	.early_init = 0,
   6541};
   6542
   6543/*
   6544 * This function calculates an individual cgroup's effective
   6545 * protection which is derived from its own memory.min/low, its
   6546 * parent's and siblings' settings, as well as the actual memory
   6547 * distribution in the tree.
   6548 *
   6549 * The following rules apply to the effective protection values:
   6550 *
   6551 * 1. At the first level of reclaim, effective protection is equal to
   6552 *    the declared protection in memory.min and memory.low.
   6553 *
   6554 * 2. To enable safe delegation of the protection configuration, at
   6555 *    subsequent levels the effective protection is capped to the
   6556 *    parent's effective protection.
   6557 *
   6558 * 3. To make complex and dynamic subtrees easier to configure, the
   6559 *    user is allowed to overcommit the declared protection at a given
   6560 *    level. If that is the case, the parent's effective protection is
   6561 *    distributed to the children in proportion to how much protection
   6562 *    they have declared and how much of it they are utilizing.
   6563 *
   6564 *    This makes distribution proportional, but also work-conserving:
   6565 *    if one cgroup claims much more protection than the memory it uses,
   6566 *    the unused remainder is available to its siblings.
   6567 *
   6568 * 4. Conversely, when the declared protection is undercommitted at a
   6569 *    given level, the distribution of the larger parental protection
   6570 *    budget is NOT proportional. A cgroup's protection from a sibling
   6571 *    is capped to its own memory.min/low setting.
   6572 *
   6573 * 5. However, to allow protecting recursive subtrees from each other
   6574 *    without having to declare each individual cgroup's fixed share
   6575 *    of the ancestor's claim to protection, any unutilized -
   6576 *    "floating" - protection from up the tree is distributed in
   6577 *    proportion to each cgroup's *usage*. This makes the protection
   6578 *    neutral wrt sibling cgroups and lets them compete freely over
   6579 *    the shared parental protection budget, but it protects the
   6580 *    subtree as a whole from neighboring subtrees.
   6581 *
   6582 * Note that 4. and 5. are not in conflict: 4. is about protecting
   6583 * against immediate siblings whereas 5. is about protecting against
   6584 * neighboring subtrees.
   6585 */
   6586static unsigned long effective_protection(unsigned long usage,
   6587					  unsigned long parent_usage,
   6588					  unsigned long setting,
   6589					  unsigned long parent_effective,
   6590					  unsigned long siblings_protected)
   6591{
   6592	unsigned long protected;
   6593	unsigned long ep;
   6594
   6595	protected = min(usage, setting);
   6596	/*
   6597	 * If all cgroups at this level combined claim and use more
   6598	 * protection than what the parent affords them, distribute
   6599	 * shares in proportion to utilization.
   6600	 *
   6601	 * We are using actual utilization rather than the statically
   6602	 * claimed protection in order to be work-conserving: claimed
   6603	 * but unused protection is available to siblings that would
   6604	 * otherwise get a smaller chunk than what they claimed.
   6605	 */
   6606	if (siblings_protected > parent_effective)
   6607		return protected * parent_effective / siblings_protected;
   6608
   6609	/*
   6610	 * Ok, utilized protection of all children is within what the
   6611	 * parent affords them, so we know whatever this child claims
   6612	 * and utilizes is effectively protected.
   6613	 *
   6614	 * If there is unprotected usage beyond this value, reclaim
   6615	 * will apply pressure in proportion to that amount.
   6616	 *
   6617	 * If there is unutilized protection, the cgroup will be fully
   6618	 * shielded from reclaim, but we do return a smaller value for
   6619	 * protection than what the group could enjoy in theory. This
   6620	 * is okay. With the overcommit distribution above, effective
   6621	 * protection is always dependent on how memory is actually
   6622	 * consumed among the siblings anyway.
   6623	 */
   6624	ep = protected;
   6625
   6626	/*
   6627	 * If the children aren't claiming (all of) the protection
   6628	 * afforded to them by the parent, distribute the remainder in
   6629	 * proportion to the (unprotected) memory of each cgroup. That
   6630	 * way, cgroups that aren't explicitly prioritized wrt each
   6631	 * other compete freely over the allowance, but they are
   6632	 * collectively protected from neighboring trees.
   6633	 *
   6634	 * We're using unprotected memory for the weight so that if
   6635	 * some cgroups DO claim explicit protection, we don't protect
   6636	 * the same bytes twice.
   6637	 *
   6638	 * Check both usage and parent_usage against the respective
   6639	 * protected values. One should imply the other, but they
   6640	 * aren't read atomically - make sure the division is sane.
   6641	 */
   6642	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
   6643		return ep;
   6644	if (parent_effective > siblings_protected &&
   6645	    parent_usage > siblings_protected &&
   6646	    usage > protected) {
   6647		unsigned long unclaimed;
   6648
   6649		unclaimed = parent_effective - siblings_protected;
   6650		unclaimed *= usage - protected;
   6651		unclaimed /= parent_usage - siblings_protected;
   6652
   6653		ep += unclaimed;
   6654	}
   6655
   6656	return ep;
   6657}
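
/*
 * Worked example (illustrative numbers, not from the source): with
 * parent_effective = 100 pages and parent_usage = 80, a child declaring
 * setting = 30 while using usage = 60 has protected = 30. If its sibling
 * protects 20, siblings_protected = 50 <= 100, so ep starts at 30; with
 * CGRP_ROOT_MEMORY_RECURSIVE_PROT the floating 100 - 50 = 50 pages are
 * shared out by unprotected usage, adding 50 * (60 - 30) / (80 - 50) = 50
 * and giving ep = 80. Had the children jointly protected 120 > 100, each
 * ep would instead be scaled to protected * 100 / 120.
 */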
   6658
   6659/**
   6660 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
   6661 * @root: the top ancestor of the sub-tree being checked
   6662 * @memcg: the memory cgroup to check
   6663 *
   6664 * WARNING: This function is not stateless! It can only be used as part
   6665 *          of a top-down tree iteration, not for isolated queries.
   6666 */
   6667void mem_cgroup_calculate_protection(struct mem_cgroup *root,
   6668				     struct mem_cgroup *memcg)
   6669{
   6670	unsigned long usage, parent_usage;
   6671	struct mem_cgroup *parent;
   6672
   6673	if (mem_cgroup_disabled())
   6674		return;
   6675
   6676	if (!root)
   6677		root = root_mem_cgroup;
   6678
   6679	/*
   6680	 * Effective values of the reclaim targets are ignored so they
   6681	 * can be stale. Have a look at mem_cgroup_protection for more
   6682	 * details.
   6683	 * TODO: calculation should be more robust so that we do not need
   6684	 * that special casing.
   6685	 */
   6686	if (memcg == root)
   6687		return;
   6688
   6689	usage = page_counter_read(&memcg->memory);
   6690	if (!usage)
   6691		return;
   6692
   6693	parent = parent_mem_cgroup(memcg);
   6694
   6695	if (parent == root) {
   6696		memcg->memory.emin = READ_ONCE(memcg->memory.min);
   6697		memcg->memory.elow = READ_ONCE(memcg->memory.low);
   6698		return;
   6699	}
   6700
   6701	parent_usage = page_counter_read(&parent->memory);
   6702
   6703	WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
   6704			READ_ONCE(memcg->memory.min),
   6705			READ_ONCE(parent->memory.emin),
   6706			atomic_long_read(&parent->memory.children_min_usage)));
   6707
   6708	WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
   6709			READ_ONCE(memcg->memory.low),
   6710			READ_ONCE(parent->memory.elow),
   6711			atomic_long_read(&parent->memory.children_low_usage)));
   6712}
   6713
   6714static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
   6715			gfp_t gfp)
   6716{
   6717	long nr_pages = folio_nr_pages(folio);
   6718	int ret;
   6719
   6720	ret = try_charge(memcg, gfp, nr_pages);
   6721	if (ret)
   6722		goto out;
   6723
   6724	css_get(&memcg->css);
   6725	commit_charge(folio, memcg);
   6726
   6727	local_irq_disable();
   6728	mem_cgroup_charge_statistics(memcg, nr_pages);
   6729	memcg_check_events(memcg, folio_nid(folio));
   6730	local_irq_enable();
   6731out:
   6732	return ret;
   6733}
   6734
   6735int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
   6736{
   6737	struct mem_cgroup *memcg;
   6738	int ret;
   6739
   6740	memcg = get_mem_cgroup_from_mm(mm);
   6741	ret = charge_memcg(folio, memcg, gfp);
   6742	css_put(&memcg->css);
   6743
   6744	return ret;
   6745}
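
/*
 * Caller-side sketch (illustrative; the function name is hypothetical):
 * a typical allocation path charges a freshly allocated folio to the
 * allocating mm before making it visible, and frees it if the charge
 * fails. mem_cgroup_charge() is the mem_cgroup_disabled()-aware wrapper
 * around __mem_cgroup_charge() declared in <linux/memcontrol.h>.
 */
static struct folio *__maybe_unused example_alloc_charged_folio(struct mm_struct *mm,
								gfp_t gfp)
{
	struct folio *folio = folio_alloc(gfp, 0);

	if (!folio)
		return NULL;

	if (mem_cgroup_charge(folio, mm, gfp)) {
		folio_put(folio);	/* charge failed: release the folio */
		return NULL;
	}

	return folio;	/* uncharged automatically when finally freed */
}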
   6746
   6747/**
   6748 * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
   6749 * @page: page to charge
   6750 * @mm: mm context of the victim
   6751 * @gfp: reclaim mode
   6752 * @entry: swap entry for which the page is allocated
   6753 *
   6754 * This function charges a page allocated for swapin. Please call this before
   6755 * adding the page to the swapcache.
   6756 *
   6757 * Returns 0 on success. Otherwise, an error code is returned.
   6758 */
   6759int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
   6760				  gfp_t gfp, swp_entry_t entry)
   6761{
   6762	struct folio *folio = page_folio(page);
   6763	struct mem_cgroup *memcg;
   6764	unsigned short id;
   6765	int ret;
   6766
   6767	if (mem_cgroup_disabled())
   6768		return 0;
   6769
   6770	id = lookup_swap_cgroup_id(entry);
   6771	rcu_read_lock();
   6772	memcg = mem_cgroup_from_id(id);
   6773	if (!memcg || !css_tryget_online(&memcg->css))
   6774		memcg = get_mem_cgroup_from_mm(mm);
   6775	rcu_read_unlock();
   6776
   6777	ret = charge_memcg(folio, memcg, gfp);
   6778
   6779	css_put(&memcg->css);
   6780	return ret;
   6781}
   6782
   6783/*
   6784 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
   6785 * @entry: swap entry for which the page is charged
   6786 *
   6787 * Call this function after successfully adding the charged page to swapcache.
   6788 *
   6789 * Note: This function assumes the page whose swap slot is being uncharged
   6790 * is an order-0 page.
   6791 */
   6792void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
   6793{
   6794	/*
   6795	 * Cgroup1's unified memory+swap counter has been charged with the
   6796	 * new swapcache page, finish the transfer by uncharging the swap
   6797	 * slot. The swap slot would also get uncharged when it dies, but
   6798	 * it can stick around indefinitely and we'd count the page twice
   6799	 * the entire time.
   6800	 *
   6801	 * Cgroup2 has separate resource counters for memory and swap,
   6802	 * so this is a non-issue here. Memory and swap charge lifetimes
   6803	 * correspond 1:1 to page and swap slot lifetimes: we charge the
   6804	 * page to memory here, and uncharge swap when the slot is freed.
   6805	 */
   6806	if (!mem_cgroup_disabled() && do_memsw_account()) {
   6807		/*
   6808		 * The swap entry might not get freed for a long time,
   6809		 * let's not wait for it.  The page already received a
   6810		 * memory+swap charge, drop the swap entry duplicate.
   6811		 */
   6812		mem_cgroup_uncharge_swap(entry, 1);
   6813	}
   6814}
   6815
   6816struct uncharge_gather {
   6817	struct mem_cgroup *memcg;
   6818	unsigned long nr_memory;
   6819	unsigned long pgpgout;
   6820	unsigned long nr_kmem;
   6821	int nid;
   6822};
   6823
   6824static inline void uncharge_gather_clear(struct uncharge_gather *ug)
   6825{
   6826	memset(ug, 0, sizeof(*ug));
   6827}
   6828
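       /*
        * Flush one gathered batch of uncharges: give the pages back to the
        * memory counter (and the memsw counter when memory+swap accounting
        * is used), undo any kmem accounting, wake up OOM waiters, update the
        * event counters with interrupts disabled, and finally drop the css
        * reference taken in uncharge_folio().
        */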
   6829static void uncharge_batch(const struct uncharge_gather *ug)
   6830{
   6831	unsigned long flags;
   6832
   6833	if (ug->nr_memory) {
   6834		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
   6835		if (do_memsw_account())
   6836			page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
   6837		if (ug->nr_kmem)
   6838			memcg_account_kmem(ug->memcg, -ug->nr_kmem);
   6839		memcg_oom_recover(ug->memcg);
   6840	}
   6841
   6842	local_irq_save(flags);
   6843	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
   6844	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
   6845	memcg_check_events(ug->memcg, ug->nid);
   6846	local_irq_restore(flags);
   6847
   6848	/* drop reference from uncharge_folio */
   6849	css_put(&ug->memcg->css);
   6850}
   6851
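       /*
        * Fold one folio into the uncharge gather. Whenever the folio belongs
        * to a different memcg than the one currently being gathered, the
        * pending batch is flushed first. The folio's memcg binding is
        * cleared here; the counters are only updated later in
        * uncharge_batch().
        */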
   6852static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
   6853{
   6854	long nr_pages;
   6855	struct mem_cgroup *memcg;
   6856	struct obj_cgroup *objcg;
   6857
   6858	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
   6859
   6860	/*
   6861	 * Nobody should be changing or seriously looking at
   6862	 * the folio's memcg or objcg at this point; we have fully
   6863	 * exclusive access to the folio.
   6864	 */
   6865	if (folio_memcg_kmem(folio)) {
   6866		objcg = __folio_objcg(folio);
   6867		/*
   6868		 * This get matches the put at the end of the function;
   6869		 * kmem pages do not hold memcg references anymore.
   6870		 */
   6871		memcg = get_mem_cgroup_from_objcg(objcg);
   6872	} else {
   6873		memcg = __folio_memcg(folio);
   6874	}
   6875
   6876	if (!memcg)
   6877		return;
   6878
   6879	if (ug->memcg != memcg) {
   6880		if (ug->memcg) {
   6881			uncharge_batch(ug);
   6882			uncharge_gather_clear(ug);
   6883		}
   6884		ug->memcg = memcg;
   6885		ug->nid = folio_nid(folio);
   6886
   6887		/* pairs with css_put in uncharge_batch */
   6888		css_get(&memcg->css);
   6889	}
   6890
   6891	nr_pages = folio_nr_pages(folio);
   6892
   6893	if (folio_memcg_kmem(folio)) {
   6894		ug->nr_memory += nr_pages;
   6895		ug->nr_kmem += nr_pages;
   6896
   6897		folio->memcg_data = 0;
   6898		obj_cgroup_put(objcg);
   6899	} else {
   6900		/* LRU pages aren't accounted at the root level */
   6901		if (!mem_cgroup_is_root(memcg))
   6902			ug->nr_memory += nr_pages;
   6903		ug->pgpgout++;
   6904
   6905		folio->memcg_data = 0;
   6906	}
   6907
   6908	css_put(&memcg->css);
   6909}
   6910
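       /*
        * Uncharge a single folio that is about to be freed, reusing the
        * batching machinery with a one-entry gather. Folios that were never
        * charged are skipped by the pre-check below.
        */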
   6911void __mem_cgroup_uncharge(struct folio *folio)
   6912{
   6913	struct uncharge_gather ug;
   6914
   6915	/* Don't touch folio->lru of any random folio; pre-check: */
   6916	if (!folio_memcg(folio))
   6917		return;
   6918
   6919	uncharge_gather_clear(&ug);
   6920	uncharge_folio(folio, &ug);
   6921	uncharge_batch(&ug);
   6922}
   6923
   6924/**
   6925 * __mem_cgroup_uncharge_list - uncharge a list of pages
   6926 * @page_list: list of pages to uncharge
   6927 *
   6928 * Uncharge a list of pages previously charged with
   6929 * __mem_cgroup_charge().
   6930 */
   6931void __mem_cgroup_uncharge_list(struct list_head *page_list)
   6932{
   6933	struct uncharge_gather ug;
   6934	struct folio *folio;
   6935
   6936	uncharge_gather_clear(&ug);
   6937	list_for_each_entry(folio, page_list, lru)
   6938		uncharge_folio(folio, &ug);
   6939	if (ug.memcg)
   6940		uncharge_batch(&ug);
   6941}
   6942
   6943/**
   6944 * mem_cgroup_migrate - Charge a folio's replacement.
   6945 * @old: Currently circulating folio.
   6946 * @new: Replacement folio.
   6947 *
   6948 * Charge @new as a replacement folio for @old. @old will
   6949 * be uncharged upon free.
   6950 *
   6951 * Both folios must be locked, @new->mapping must be set up.
   6952 */
   6953void mem_cgroup_migrate(struct folio *old, struct folio *new)
   6954{
   6955	struct mem_cgroup *memcg;
   6956	long nr_pages = folio_nr_pages(new);
   6957	unsigned long flags;
   6958
   6959	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
   6960	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
   6961	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
   6962	VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
   6963
   6964	if (mem_cgroup_disabled())
   6965		return;
   6966
   6967	/* Page cache replacement: new folio already charged? */
   6968	if (folio_memcg(new))
   6969		return;
   6970
   6971	memcg = folio_memcg(old);
   6972	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
   6973	if (!memcg)
   6974		return;
   6975
   6976	/* Force-charge the new page. The old one will be freed soon */
   6977	if (!mem_cgroup_is_root(memcg)) {
   6978		page_counter_charge(&memcg->memory, nr_pages);
   6979		if (do_memsw_account())
   6980			page_counter_charge(&memcg->memsw, nr_pages);
   6981	}
   6982
   6983	css_get(&memcg->css);
   6984	commit_charge(new, memcg);
   6985
   6986	local_irq_save(flags);
   6987	mem_cgroup_charge_statistics(memcg, nr_pages);
   6988	memcg_check_events(memcg, folio_nid(new));
   6989	local_irq_restore(flags);
   6990}
   6991
   6992DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
   6993EXPORT_SYMBOL(memcg_sockets_enabled_key);
   6994
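       /*
        * Associate a newly created socket with the memcg of the current task
        * so that its buffers can later be charged via
        * mem_cgroup_charge_skmem(). Sockets created from interrupt context,
        * sockets of the root memcg and, on cgroup1, sockets of memcgs without
        * active tcpmem accounting are left unassociated.
        */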
   6995void mem_cgroup_sk_alloc(struct sock *sk)
   6996{
   6997	struct mem_cgroup *memcg;
   6998
   6999	if (!mem_cgroup_sockets_enabled)
   7000		return;
   7001
   7002	/* Do not associate the sock with an unrelated interrupted task's memcg. */
   7003	if (!in_task())
   7004		return;
   7005
   7006	rcu_read_lock();
   7007	memcg = mem_cgroup_from_task(current);
   7008	if (memcg == root_mem_cgroup)
   7009		goto out;
   7010	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
   7011		goto out;
   7012	if (css_tryget(&memcg->css))
   7013		sk->sk_memcg = memcg;
   7014out:
   7015	rcu_read_unlock();
   7016}
   7017
   7018void mem_cgroup_sk_free(struct sock *sk)
   7019{
   7020	if (sk->sk_memcg)
   7021		css_put(&sk->sk_memcg->css);
   7022}
   7023
   7024/**
   7025 * mem_cgroup_charge_skmem - charge socket memory
   7026 * @memcg: memcg to charge
   7027 * @nr_pages: number of pages to charge
   7028 * @gfp_mask: reclaim mode
   7029 *
   7030 * Charges @nr_pages to @memcg. Returns %true if the charge fits within
   7031 * @memcg's configured limit, %false if it doesn't.
   7032 */
   7033bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
   7034			     gfp_t gfp_mask)
   7035{
   7036	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
   7037		struct page_counter *fail;
   7038
   7039		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
   7040			memcg->tcpmem_pressure = 0;
   7041			return true;
   7042		}
   7043		memcg->tcpmem_pressure = 1;
   7044		if (gfp_mask & __GFP_NOFAIL) {
   7045			page_counter_charge(&memcg->tcpmem, nr_pages);
   7046			return true;
   7047		}
   7048		return false;
   7049	}
   7050
   7051	if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
   7052		mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
   7053		return true;
   7054	}
   7055
   7056	return false;
   7057}
   7058
   7059/**
   7060 * mem_cgroup_uncharge_skmem - uncharge socket memory
   7061 * @memcg: memcg to uncharge
   7062 * @nr_pages: number of pages to uncharge
   7063 */
   7064void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
   7065{
   7066	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
   7067		page_counter_uncharge(&memcg->tcpmem, nr_pages);
   7068		return;
   7069	}
   7070
   7071	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
   7072
   7073	refill_stock(memcg, nr_pages);
   7074}
   7075
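       /*
        * Parse the "cgroup.memory=" boot parameter. The recognized tokens are
        * "nosocket" (disable socket memory accounting) and "nokmem" (disable
        * kernel memory accounting); unknown tokens are silently ignored.
        * Example:
        *
        *	cgroup.memory=nosocket,nokmem
        */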
   7076static int __init cgroup_memory(char *s)
   7077{
   7078	char *token;
   7079
   7080	while ((token = strsep(&s, ",")) != NULL) {
   7081		if (!*token)
   7082			continue;
   7083		if (!strcmp(token, "nosocket"))
   7084			cgroup_memory_nosocket = true;
   7085		if (!strcmp(token, "nokmem"))
   7086			cgroup_memory_nokmem = true;
   7087	}
   7088	return 1;
   7089}
   7090__setup("cgroup.memory=", cgroup_memory);
   7091
   7092/*
   7093 * subsys_initcall() for memory controller.
   7094 *
   7095 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
   7096 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
   7097 * basically everything that doesn't depend on a specific mem_cgroup structure
   7098 * should be initialized from here.
   7099 */
   7100static int __init mem_cgroup_init(void)
   7101{
   7102	int cpu, node;
   7103
   7104	/*
   7105	 * Currently an s32 type (cf. struct batched_lruvec_stat) is used
   7106	 * for per-memcg-per-cpu caching of per-node statistics. For this to
   7107	 * work correctly, the overfill threshold must not exceed
   7108	 * S32_MAX / PAGE_SIZE.
   7109	 */
   7110	BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
   7111
   7112	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
   7113				  memcg_hotplug_cpu_dead);
   7114
   7115	for_each_possible_cpu(cpu)
   7116		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
   7117			  drain_local_stock);
   7118
   7119	for_each_node(node) {
   7120		struct mem_cgroup_tree_per_node *rtpn;
   7121
   7122		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
   7123				    node_online(node) ? node : NUMA_NO_NODE);
   7124
   7125		rtpn->rb_root = RB_ROOT;
   7126		rtpn->rb_rightmost = NULL;
   7127		spin_lock_init(&rtpn->lock);
   7128		soft_limit_tree.rb_tree_per_node[node] = rtpn;
   7129	}
   7130
   7131	return 0;
   7132}
   7133subsys_initcall(mem_cgroup_init);
   7134
   7135#ifdef CONFIG_MEMCG_SWAP
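       /*
        * Take an ID reference on @memcg or, if its ID refcount has already
        * dropped to zero, on the nearest ancestor whose ID is still live;
        * the root cgroup always qualifies. Returns the memcg the reference
        * was actually taken on.
        */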
   7136static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
   7137{
   7138	while (!refcount_inc_not_zero(&memcg->id.ref)) {
   7139		/*
   7140		 * The root cgroup cannot be destroyed, so its refcount must
   7141		 * always be >= 1.
   7142		 */
   7143		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
   7144			VM_BUG_ON(1);
   7145			break;
   7146		}
   7147		memcg = parent_mem_cgroup(memcg);
   7148		if (!memcg)
   7149			memcg = root_mem_cgroup;
   7150	}
   7151	return memcg;
   7152}
   7153
   7154/**
   7155 * mem_cgroup_swapout - transfer a memsw charge to swap
   7156 * @folio: folio whose memsw charge to transfer
   7157 * @entry: swap entry to move the charge to
   7158 *
   7159 * Transfer the memsw charge of @folio to @entry.
   7160 */
   7161void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
   7162{
   7163	struct mem_cgroup *memcg, *swap_memcg;
   7164	unsigned int nr_entries;
   7165	unsigned short oldid;
   7166
   7167	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
   7168	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
   7169
   7170	if (mem_cgroup_disabled())
   7171		return;
   7172
   7173	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7174		return;
   7175
   7176	memcg = folio_memcg(folio);
   7177
   7178	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
   7179	if (!memcg)
   7180		return;
   7181
   7182	/*
   7183	 * In case the memcg owning these pages has been offlined and doesn't
   7184	 * have an ID allocated to it anymore, charge the closest online
   7185	 * ancestor for the swap instead and transfer the memory+swap charge.
   7186	 */
   7187	swap_memcg = mem_cgroup_id_get_online(memcg);
   7188	nr_entries = folio_nr_pages(folio);
   7189	/* Get references for the tail pages, too */
   7190	if (nr_entries > 1)
   7191		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
   7192	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
   7193				   nr_entries);
   7194	VM_BUG_ON_FOLIO(oldid, folio);
   7195	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
   7196
   7197	folio->memcg_data = 0;
   7198
   7199	if (!mem_cgroup_is_root(memcg))
   7200		page_counter_uncharge(&memcg->memory, nr_entries);
   7201
   7202	if (!cgroup_memory_noswap && memcg != swap_memcg) {
   7203		if (!mem_cgroup_is_root(swap_memcg))
   7204			page_counter_charge(&swap_memcg->memsw, nr_entries);
   7205		page_counter_uncharge(&memcg->memsw, nr_entries);
   7206	}
   7207
   7208	/*
   7209	 * Interrupts should be disabled here because the caller holds the
   7210	 * i_pages lock, which is taken with interrupts off. Keeping
   7211	 * interrupts disabled matters because it is the only
   7212	 * synchronisation we have for updating the per-CPU variables.
   7213	 */
   7214	memcg_stats_lock();
   7215	mem_cgroup_charge_statistics(memcg, -nr_entries);
   7216	memcg_stats_unlock();
   7217	memcg_check_events(memcg, folio_nid(folio));
   7218
   7219	css_put(&memcg->css);
   7220}
   7221
   7222/**
   7223 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
   7224 * @folio: folio being added to swap
   7225 * @entry: swap entry to charge
   7226 *
   7227 * Try to charge @folio's memcg for the swap space at @entry.
   7228 *
   7229 * Returns 0 on success, -ENOMEM on failure.
   7230 */
   7231int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
   7232{
   7233	unsigned int nr_pages = folio_nr_pages(folio);
   7234	struct page_counter *counter;
   7235	struct mem_cgroup *memcg;
   7236	unsigned short oldid;
   7237
   7238	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7239		return 0;
   7240
   7241	memcg = folio_memcg(folio);
   7242
   7243	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
   7244	if (!memcg)
   7245		return 0;
   7246
   7247	if (!entry.val) {
   7248		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
   7249		return 0;
   7250	}
   7251
   7252	memcg = mem_cgroup_id_get_online(memcg);
   7253
   7254	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
   7255	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
   7256		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
   7257		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
   7258		mem_cgroup_id_put(memcg);
   7259		return -ENOMEM;
   7260	}
   7261
   7262	/* Get references for the tail pages, too */
   7263	if (nr_pages > 1)
   7264		mem_cgroup_id_get_many(memcg, nr_pages - 1);
   7265	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
   7266	VM_BUG_ON_FOLIO(oldid, folio);
   7267	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
   7268
   7269	return 0;
   7270}
   7271
   7272/**
   7273 * __mem_cgroup_uncharge_swap - uncharge swap space
   7274 * @entry: swap entry to uncharge
   7275 * @nr_pages: the amount of swap space to uncharge
   7276 */
   7277void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
   7278{
   7279	struct mem_cgroup *memcg;
   7280	unsigned short id;
   7281
   7282	id = swap_cgroup_record(entry, 0, nr_pages);
   7283	rcu_read_lock();
   7284	memcg = mem_cgroup_from_id(id);
   7285	if (memcg) {
   7286		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
   7287			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7288				page_counter_uncharge(&memcg->swap, nr_pages);
   7289			else
   7290				page_counter_uncharge(&memcg->memsw, nr_pages);
   7291		}
   7292		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
   7293		mem_cgroup_id_put_many(memcg, nr_pages);
   7294	}
   7295	rcu_read_unlock();
   7296}
   7297
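       /*
        * Return how many swap pages the hierarchy of @memcg may still use.
        * With swap accounting disabled or on the legacy (cgroup1) hierarchy
        * only the global free swap count matters; otherwise it is clamped by
        * the remaining swap.max headroom of @memcg and each ancestor below
        * the root.
        */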
   7298long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
   7299{
   7300	long nr_swap_pages = get_nr_swap_pages();
   7301
   7302	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7303		return nr_swap_pages;
   7304	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
   7305		nr_swap_pages = min_t(long, nr_swap_pages,
   7306				      READ_ONCE(memcg->swap.max) -
   7307				      page_counter_read(&memcg->swap));
   7308	return nr_swap_pages;
   7309}
   7310
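       /*
        * Report whether swap should be treated as full for @page's memcg:
        * true when vm_swap_full() reports global swap pressure, or when
        * @page's memcg or any ancestor below the root has used at least half
        * of its swap.high or swap.max limit.
        */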
   7311bool mem_cgroup_swap_full(struct page *page)
   7312{
   7313	struct mem_cgroup *memcg;
   7314
   7315	VM_BUG_ON_PAGE(!PageLocked(page), page);
   7316
   7317	if (vm_swap_full())
   7318		return true;
   7319	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7320		return false;
   7321
   7322	memcg = page_memcg(page);
   7323	if (!memcg)
   7324		return false;
   7325
   7326	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
   7327		unsigned long usage = page_counter_read(&memcg->swap);
   7328
   7329		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
   7330		    usage * 2 >= READ_ONCE(memcg->swap.max))
   7331			return true;
   7332	}
   7333
   7334	return false;
   7335}
   7336
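       /*
        * Parse the legacy "swapaccount=" boot parameter: "0" disables and "1"
        * enables cgroup swap accounting. Example:
        *
        *	swapaccount=0
        */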
   7337static int __init setup_swap_account(char *s)
   7338{
   7339	if (!strcmp(s, "1"))
   7340		cgroup_memory_noswap = false;
   7341	else if (!strcmp(s, "0"))
   7342		cgroup_memory_noswap = true;
   7343	return 1;
   7344}
   7345__setup("swapaccount=", setup_swap_account);
   7346
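       /* Back the "swap.current" file: report current swap usage in bytes. */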
   7347static u64 swap_current_read(struct cgroup_subsys_state *css,
   7348			     struct cftype *cft)
   7349{
   7350	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   7351
   7352	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
   7353}
   7354
   7355static int swap_high_show(struct seq_file *m, void *v)
   7356{
   7357	return seq_puts_memcg_tunable(m,
   7358		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
   7359}
   7360
   7361static ssize_t swap_high_write(struct kernfs_open_file *of,
   7362			       char *buf, size_t nbytes, loff_t off)
   7363{
   7364	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   7365	unsigned long high;
   7366	int err;
   7367
   7368	buf = strstrip(buf);
   7369	err = page_counter_memparse(buf, "max", &high);
   7370	if (err)
   7371		return err;
   7372
   7373	page_counter_set_high(&memcg->swap, high);
   7374
   7375	return nbytes;
   7376}
   7377
   7378static int swap_max_show(struct seq_file *m, void *v)
   7379{
   7380	return seq_puts_memcg_tunable(m,
   7381		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
   7382}
   7383
   7384static ssize_t swap_max_write(struct kernfs_open_file *of,
   7385			      char *buf, size_t nbytes, loff_t off)
   7386{
   7387	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   7388	unsigned long max;
   7389	int err;
   7390
   7391	buf = strstrip(buf);
   7392	err = page_counter_memparse(buf, "max", &max);
   7393	if (err)
   7394		return err;
   7395
   7396	xchg(&memcg->swap.max, max);
   7397
   7398	return nbytes;
   7399}
   7400
   7401static int swap_events_show(struct seq_file *m, void *v)
   7402{
   7403	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
   7404
   7405	seq_printf(m, "high %lu\n",
   7406		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
   7407	seq_printf(m, "max %lu\n",
   7408		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
   7409	seq_printf(m, "fail %lu\n",
   7410		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
   7411
   7412	return 0;
   7413}
   7414
   7415static struct cftype swap_files[] = {
   7416	{
   7417		.name = "swap.current",
   7418		.flags = CFTYPE_NOT_ON_ROOT,
   7419		.read_u64 = swap_current_read,
   7420	},
   7421	{
   7422		.name = "swap.high",
   7423		.flags = CFTYPE_NOT_ON_ROOT,
   7424		.seq_show = swap_high_show,
   7425		.write = swap_high_write,
   7426	},
   7427	{
   7428		.name = "swap.max",
   7429		.flags = CFTYPE_NOT_ON_ROOT,
   7430		.seq_show = swap_max_show,
   7431		.write = swap_max_write,
   7432	},
   7433	{
   7434		.name = "swap.events",
   7435		.flags = CFTYPE_NOT_ON_ROOT,
   7436		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
   7437		.seq_show = swap_events_show,
   7438	},
   7439	{ }	/* terminate */
   7440};
   7441
   7442static struct cftype memsw_files[] = {
   7443	{
   7444		.name = "memsw.usage_in_bytes",
   7445		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
   7446		.read_u64 = mem_cgroup_read_u64,
   7447	},
   7448	{
   7449		.name = "memsw.max_usage_in_bytes",
   7450		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
   7451		.write = mem_cgroup_reset,
   7452		.read_u64 = mem_cgroup_read_u64,
   7453	},
   7454	{
   7455		.name = "memsw.limit_in_bytes",
   7456		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
   7457		.write = mem_cgroup_write,
   7458		.read_u64 = mem_cgroup_read_u64,
   7459	},
   7460	{
   7461		.name = "memsw.failcnt",
   7462		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
   7463		.write = mem_cgroup_reset,
   7464		.read_u64 = mem_cgroup_read_u64,
   7465	},
   7466	{ },	/* terminate */
   7467};
   7468
   7469#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
   7470/**
   7471 * obj_cgroup_may_zswap - check if this cgroup can zswap
   7472 * @objcg: the object cgroup
   7473 *
   7474 * Check if the hierarchical zswap limit has been reached.
   7475 *
   7476 * This doesn't check for specific headroom, and it is not atomic
   7477 * either. But with zswap, the size of the allocation is only known
   7478 * once compression has occurred, and this optimistic pre-check avoids
   7479 * spending cycles on compression when there is already no room left
   7480 * or zswap is disabled altogether somewhere in the hierarchy.
   7481 */
   7482bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
   7483{
   7484	struct mem_cgroup *memcg, *original_memcg;
   7485	bool ret = true;
   7486
   7487	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7488		return true;
   7489
   7490	original_memcg = get_mem_cgroup_from_objcg(objcg);
   7491	for (memcg = original_memcg; memcg != root_mem_cgroup;
   7492	     memcg = parent_mem_cgroup(memcg)) {
   7493		unsigned long max = READ_ONCE(memcg->zswap_max);
   7494		unsigned long pages;
   7495
   7496		if (max == PAGE_COUNTER_MAX)
   7497			continue;
   7498		if (max == 0) {
   7499			ret = false;
   7500			break;
   7501		}
   7502
   7503		cgroup_rstat_flush(memcg->css.cgroup);
   7504		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
   7505		if (pages < max)
   7506			continue;
   7507		ret = false;
   7508		break;
   7509	}
   7510	mem_cgroup_put(original_memcg);
   7511	return ret;
   7512}
   7513
   7514/**
   7515 * obj_cgroup_charge_zswap - charge compression backend memory
   7516 * @objcg: the object cgroup
   7517 * @size: size of compressed object
   7518 *
   7519 * This forces the charge after obj_cgroup_may_zswap() allowed
   7520 * compression and storage in zswap for this cgroup to go ahead.
   7521 */
   7522void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
   7523{
   7524	struct mem_cgroup *memcg;
   7525
   7526	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7527		return;
   7528
   7529	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
   7530
   7531	/* PF_MEMALLOC context, charging must succeed */
   7532	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
   7533		VM_WARN_ON_ONCE(1);
   7534
   7535	rcu_read_lock();
   7536	memcg = obj_cgroup_memcg(objcg);
   7537	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
   7538	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
   7539	rcu_read_unlock();
   7540}
   7541
   7542/**
   7543 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
   7544 * @objcg: the object cgroup
   7545 * @size: size of compressed object
   7546 *
   7547 * Uncharges zswap memory on page-in.
   7548 */
   7549void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
   7550{
   7551	struct mem_cgroup *memcg;
   7552
   7553	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
   7554		return;
   7555
   7556	obj_cgroup_uncharge(objcg, size);
   7557
   7558	rcu_read_lock();
   7559	memcg = obj_cgroup_memcg(objcg);
   7560	mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
   7561	mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
   7562	rcu_read_unlock();
   7563}
   7564
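       /*
        * Back the "zswap.current" file: flush rstat so MEMCG_ZSWAP_B is up to
        * date and report the compressed size, in bytes, of zswap-backed
        * memory charged to this cgroup.
        */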
   7565static u64 zswap_current_read(struct cgroup_subsys_state *css,
   7566			      struct cftype *cft)
   7567{
   7568	cgroup_rstat_flush(css->cgroup);
   7569	return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
   7570}
   7571
   7572static int zswap_max_show(struct seq_file *m, void *v)
   7573{
   7574	return seq_puts_memcg_tunable(m,
   7575		READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
   7576}
   7577
   7578static ssize_t zswap_max_write(struct kernfs_open_file *of,
   7579			       char *buf, size_t nbytes, loff_t off)
   7580{
   7581	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
   7582	unsigned long max;
   7583	int err;
   7584
   7585	buf = strstrip(buf);
   7586	err = page_counter_memparse(buf, "max", &max);
   7587	if (err)
   7588		return err;
   7589
   7590	xchg(&memcg->zswap_max, max);
   7591
   7592	return nbytes;
   7593}
   7594
   7595static struct cftype zswap_files[] = {
   7596	{
   7597		.name = "zswap.current",
   7598		.flags = CFTYPE_NOT_ON_ROOT,
   7599		.read_u64 = zswap_current_read,
   7600	},
   7601	{
   7602		.name = "zswap.max",
   7603		.flags = CFTYPE_NOT_ON_ROOT,
   7604		.seq_show = zswap_max_show,
   7605		.write = zswap_max_write,
   7606	},
   7607	{ }	/* terminate */
   7608};
   7609#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
   7610
   7611/*
   7612 * If mem_cgroup_swap_init() were implemented as a subsys_initcall()
   7613 * instead of a core_initcall(), cgroup_memory_noswap could remain false
   7614 * even when memcg is disabled via the "cgroup_disable=memory" boot
   7615 * parameter, which may result in a premature OOPS inside
   7616 * mem_cgroup_get_nr_swap_pages() in corner cases.
   7617 */
   7618static int __init mem_cgroup_swap_init(void)
   7619{
   7620	/* No memory control -> no swap control */
   7621	if (mem_cgroup_disabled())
   7622		cgroup_memory_noswap = true;
   7623
   7624	if (cgroup_memory_noswap)
   7625		return 0;
   7626
   7627	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
   7628	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
   7629#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
   7630	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
   7631#endif
   7632	return 0;
   7633}
   7634core_initcall(mem_cgroup_swap_init);
   7635
   7636#endif /* CONFIG_MEMCG_SWAP */