cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

hugetlb_cgroup.c (25155B)


/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

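/*
 * Each per-hstate control file encodes both the hstate index and the
 * resource attribute it operates on in cftype->private: the index lives
 * in the upper 16 bits, the attribute in the lower 16.
 */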
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
				     bool rsvd)
{
	if (rsvd)
		return &h_cg->rsvd_hugepage[idx];
	return &h_cg->hugepage[idx];
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	int idx;

	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
		if (page_counter_read(
				hugetlb_cgroup_counter_from_cgroup(h_cg, idx)))
			return true;
	}
	return false;
}

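/*
 * Initialize the fault and reserve page counters for every hstate, parenting
 * each counter to the matching counter in parent_h_cgroup (if any) so that
 * charges propagate up the hierarchy, and cap each counter at the largest
 * limit that is a multiple of the huge page size.
 */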
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
				struct hugetlb_cgroup *parent_h_cgroup)
{
	int idx;

	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
		struct page_counter *fault_parent = NULL;
		struct page_counter *rsvd_parent = NULL;
		unsigned long limit;
		int ret;

		if (parent_h_cgroup) {
			fault_parent = hugetlb_cgroup_counter_from_cgroup(
				parent_h_cgroup, idx);
			rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
				parent_h_cgroup, idx);
		}
		page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
								     idx),
				  fault_parent);
		page_counter_init(
			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
			rsvd_parent);

		limit = round_down(PAGE_COUNTER_MAX,
				   pages_per_huge_page(&hstates[idx]));

		ret = page_counter_set_max(
			hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
			limit);
		VM_BUG_ON(ret);
		ret = page_counter_set_max(
			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
			limit);
		VM_BUG_ON(ret);
	}
}

static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
	int node;

	for_each_node(node)
		kfree(h_cgroup->nodeinfo[node]);
	kfree(h_cgroup);
}

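/*
 * Allocate the hugetlb_cgroup together with one hugetlb_cgroup_per_node
 * entry for each possible node; the root cgroup (allocated first, with a
 * NULL parent) is remembered in root_h_cgroup.
 */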
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int node;

	h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
			   GFP_KERNEL);

	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (!parent_h_cgroup)
		root_h_cgroup = h_cgroup;

	/*
	 * TODO: this routine can waste a lot of memory for nodes that will
	 * never be onlined. It would be better to use a memory hotplug
	 * callback instead.
	 */
	for_each_node(node) {
		/* Set node_to_alloc to -1 for offline nodes. */
		int node_to_alloc =
			node_state(node, N_NORMAL_MEMORY) ? node : -1;
		h_cgroup->nodeinfo[node] =
			kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
				     GFP_KERNEL, node_to_alloc);
		if (!h_cgroup->nodeinfo[node])
			goto fail_alloc_nodeinfo;
	}

	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
	return &h_cgroup->css;

fail_alloc_nodeinfo:
	hugetlb_cgroup_free(h_cgroup);
	return ERR_PTR(-ENOMEM);
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}

/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot get moved off the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference or test whether the page is active here. This
 * function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	page_hcg = hugetlb_cgroup_from_page(page);
	/*
	 * We can have pages on the active list that are not charged to any
	 * cgroup, i.e. huge pages built from fewer than 3 base pages. We
	 * can safely ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = compound_nr(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(page, parent);
out:
	return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;
	int idx;

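	/*
	 * Walk each hstate's active list under hugetlb_lock, reparenting
	 * every page still charged to this cgroup, and keep rescanning
	 * until all of the cgroup's usage is gone.
	 */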
	do {
		idx = 0;
		for_each_hstate(h) {
			spin_lock_irq(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(idx, h_cg, page);

			spin_unlock_irq(&hugetlb_lock);
			idx++;
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

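/*
 * Record a memory event: bump this cgroup's local counter, then bump the
 * hierarchical counter on this cgroup and every ancestor up to (but not
 * including) the root, notifying anyone polling the corresponding events
 * files.
 */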
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
				 enum hugetlb_memory_event event)
{
	atomic_long_inc(&hugetlb->events_local[idx][event]);
	cgroup_file_notify(&hugetlb->events_local_file[idx]);

	do {
		atomic_long_inc(&hugetlb->events[idx][event]);
		cgroup_file_notify(&hugetlb->events_file[idx]);
	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
		 !hugetlb_cgroup_is_root(hugetlb));
}

static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
					  struct hugetlb_cgroup **ptr,
					  bool rsvd)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has fewer
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
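	/*
	 * Look up the task's cgroup under RCU; if we race with the css
	 * being torn down, css_tryget() fails and we retry the lookup.
	 */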
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(
		    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
		    nr_pages, &counter)) {
		ret = -ENOMEM;
		hugetlb_event(h_cg, idx, HUGETLB_MAX);
		css_put(&h_cg->css);
		goto done;
	}
	/* Reservations take a reference to the css because they do not get
	 * reparented.
	 */
	if (!rsvd)
		css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}

int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
				      struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}

/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
					   struct hugetlb_cgroup *h_cg,
					   struct page *page, bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	__set_hugetlb_cgroup(page, h_cg, rsvd);
	if (!rsvd) {
		unsigned long usage =
			h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
		/*
		 * This read-modify-write is not atomic, but that's fine
		 * because we are called with hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
			   usage + nr_pages);
	}
}

void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
}

void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
				       struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
}

/*
 * Should be called with hugetlb_lock held.
 */
static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
					   struct page *page, bool rsvd)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = __hugetlb_cgroup_from_page(page, rsvd);
	if (unlikely(!h_cg))
		return;
	__set_hugetlb_cgroup(page, NULL, rsvd);

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
	else {
		unsigned long usage =
			h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
		/*
		 * This read-modify-write is not atomic, but that's fine
		 * because we are called with hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
			   usage - nr_pages);
	}
}

void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
				  struct page *page)
{
	__hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
}

void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
				       struct page *page)
{
	__hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
}

static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
					     struct hugetlb_cgroup *h_cg,
					     bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}

void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
					 struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}

void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
				     unsigned long end)
{
	if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
	    !resv->css)
		return;

	page_counter_uncharge(resv->reservation_counter,
			      (end - start) * resv->pages_per_hpage);
	css_put(resv->css);
}

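/*
 * Uncharge a file_region's reservation. rg->reservation_counter is only
 * set when the region itself was charged to a cgroup; when the whole
 * resv_map carries the counter (resv->reservation_counter set), the map
 * is uncharged in one go by hugetlb_cgroup_uncharge_counter() instead,
 * which is why that case is excluded below.
 */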
void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
					 struct file_region *rg,
					 unsigned long nr_pages,
					 bool region_del)
{
	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
		return;

	if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 &&
	    !resv->reservation_counter) {
		page_counter_uncharge(rg->reservation_counter,
				      nr_pages * resv->pages_per_hpage);
		/*
		 * Only do css_put(rg->css) when we delete the entire region
		 * because one file_region must hold exactly one css reference.
		 */
		if (region_del)
			css_put(rg->css);
	}
}

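/*
 * Resource attribute values stored in the low 16 bits of cftype->private;
 * see MEMFILE_PRIVATE() above.
 */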
enum {
	RES_USAGE,
	RES_RSVD_USAGE,
	RES_LIMIT,
	RES_RSVD_LIMIT,
	RES_MAX_USAGE,
	RES_RSVD_MAX_USAGE,
	RES_FAILCNT,
	RES_RSVD_FAILCNT,
};

static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
	int nid;
	struct cftype *cft = seq_cft(seq);
	int idx = MEMFILE_IDX(cft->private);
	bool legacy = MEMFILE_ATTR(cft->private);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	struct cgroup_subsys_state *css;
	unsigned long usage;

	if (legacy) {
		/* Add up usage across all nodes for the non-hierarchical total. */
		usage = 0;
		for_each_node_state(nid, N_MEMORY)
			usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
		seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

		/* Simply print the per-node usage for the non-hierarchical total. */
		for_each_node_state(nid, N_MEMORY)
			seq_printf(seq, " N%d=%lu", nid,
				   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
					   PAGE_SIZE);
		seq_putc(seq, '\n');
	}

	/*
	 * The hierarchical total is pretty much the value recorded by the
	 * counter, so use that.
	 */
	seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
		   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

	/*
	 * For each node, traverse the css tree to obtain the hierarchical
	 * node usage.
	 */
	for_each_node_state(nid, N_MEMORY) {
		usage = 0;
		rcu_read_lock();
		css_for_each_descendant_pre(css, &h_cg->css) {
			usage += READ_ONCE(hugetlb_cgroup_from_css(css)
						   ->nodeinfo[nid]
						   ->usage[idx]);
		}
		rcu_read_unlock();
		seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
	}

	seq_putc(seq, '\n');

	return 0;
}

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct page_counter *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_RSVD_USAGE:
		return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_RSVD_LIMIT:
		return (u64)rsvd_counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_RSVD_MAX_USAGE:
		return (u64)rsvd_counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_RSVD_FAILCNT:
		return rsvd_counter->failcnt;
	default:
		BUG();
	}
}

static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
	int idx;
	u64 val;
	struct cftype *cft = seq_cft(seq);
	unsigned long limit;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);
	counter = &h_cg->hugepage[idx];

	limit = round_down(PAGE_COUNTER_MAX,
			   pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_RSVD_USAGE:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_USAGE:
		val = (u64)page_counter_read(counter);
		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	case RES_RSVD_LIMIT:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_LIMIT:
		val = (u64)counter->max;
		if (val == limit)
			seq_puts(seq, "max\n");
		else
			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	default:
		BUG();
	}

	return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

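/*
 * Parse and apply a new limit. "max" names the token that stands for
 * "no limit": "-1" on the legacy (v1) interface and "max" on the default
 * (v2) interface, as set up by the two wrappers below.
 */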
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off,
				    const char *max)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
	bool rsvd = false;

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, max, &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);
	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_RSVD_LIMIT:
		rsvd = true;
		fallthrough;
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_set_max(
			__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
			nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter, *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_RSVD_MAX_USAGE:
		page_counter_reset_watermark(rsvd_counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	case RES_RSVD_FAILCNT:
		rsvd_counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}

static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
	int idx;
	long max;
	struct cftype *cft = seq_cft(seq);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);

	if (local)
		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
	else
		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

	seq_printf(seq, "max %lu\n", max);

	return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, false);
}

static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, true);
}

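/*
 * Build the cgroup v2 control files for one hstate. File names are derived
 * from the huge page size, e.g. "hugetlb.2MB.max" once the cgroup core
 * prepends the subsystem name.
 */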
static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_dfl[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->write = hugetlb_cgroup_write_dfl;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the reservation limit file */
	cft = &h->cgroup_files_dfl[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->write = hugetlb_cgroup_write_dfl;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the current usage file */
	cft = &h->cgroup_files_dfl[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the current reservation usage file */
	cft = &h->cgroup_files_dfl[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events file */
	cft = &h->cgroup_files_dfl[4];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events.local file */
	cft = &h->cgroup_files_dfl[5];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_local_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup,
				    events_local_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the numa stat file */
	cft = &h->cgroup_files_dfl[6];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_cgroup_read_numa_stat;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_dfl[7];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
				       h->cgroup_files_dfl));
}

static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_legacy[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write_legacy;

	/* Add the reservation limit file */
	cft = &h->cgroup_files_legacy[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write_legacy;

	/* Add the usage file */
	cft = &h->cgroup_files_legacy[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the reservation usage file */
	cft = &h->cgroup_files_legacy[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files_legacy[4];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX reservation usage file */
	cft = &h->cgroup_files_legacy[5];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files_legacy[6];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the reservation failcnt file */
	cft = &h->cgroup_files_legacy[7];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the numa stat file */
	cft = &h->cgroup_files_legacy[8];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
	cft->private = MEMFILE_PRIVATE(idx, 1);
	cft->seq_show = hugetlb_cgroup_read_numa_stat;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_legacy[9];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files_legacy));
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
	__hugetlb_cgroup_file_dfl_init(idx);
	__hugetlb_cgroup_file_legacy_init(idx);
}

void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].private for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages.
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
	struct hugetlb_cgroup *h_cg;
	struct hugetlb_cgroup *h_cg_rsvd;
	struct hstate *h = page_hstate(oldhpage);

	if (hugetlb_cgroup_disabled())
		return;

	spin_lock_irq(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(oldhpage);
	h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
	set_hugetlb_cgroup(oldhpage, NULL);
	set_hugetlb_cgroup_rsvd(oldhpage, NULL);

	/* move the h_cg details to the new page */
	set_hugetlb_cgroup(newhpage, h_cg);
	set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
	list_move(&newhpage->lru, &h->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
	return;
}

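/*
 * No size-independent control files: the per-hstate files are created at
 * boot by hugetlb_cgroup_file_init() above, once the hstates are known.
 */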
static struct cftype hugetlb_files[] = {
	{} /* terminate */
};

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
	.dfl_cftypes	= hugetlb_files,
	.legacy_cftypes	= hugetlb_files,
};