From 9da83f3fc74b806ee419a29977ef0239454bd8ec Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 30 Nov 2019 17:50:03 -0800 Subject: mm, memcg: clean up reclaim iter array The mem_cgroup_reclaim_cookie is only used in memcg softlimit reclaim now, and the priority of the reclaim is always 0. We don't need to define the iter in struct mem_cgroup_per_node as an array any more. That could make the code more clear and save some space. Link: http://lkml.kernel.org/r/1569897728-1686-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ae703ea3ef48..2b34925fc19d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,7 +58,6 @@ enum mem_cgroup_protection { struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; - int priority; unsigned int generation; }; @@ -126,7 +125,7 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; + struct mem_cgroup_reclaim_iter iter; struct memcg_shrinker_map __rcu *shrinker_map; -- cgit v1.2.3-71-gd317 From 242c37b459ce9ea1be53b75bdb76a7d9268a0791 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:50:12 -0800 Subject: include/linux/memcontrol.h: fix comments based on per-node memcg These comments should be updated as memcg limit enforcement has been moved from zones to nodes. Link: http://lkml.kernel.org/r/20191022150618.GA15519@haolee.github.io Signed-off-by: Hao Lee Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2b34925fc19d..e82928deea88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,7 +111,7 @@ struct memcg_shrinker_map { }; /* - * per-zone information in memory controller. + * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; @@ -398,8 +398,7 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) * @memcg: memcg of the wanted lruvec * * Returns the lru list vector holding pages for a given @node or a given - * @memcg and @zone. This can be the node lruvec, if the memory controller - * is disabled. + * @memcg. This can be the node lruvec, if the memory controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, struct mem_cgroup *memcg) -- cgit v1.2.3-71-gd317 From fa40d1ee9f156624658ca409a04a78882ca5b3c5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 30 Nov 2019 17:50:16 -0800 Subject: mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration between reclaimers"), the memcg reclaim does not bail out earlier based on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable pages of the memcg on all the nodes will be scanned relative to the reclaim priority. So, there is no need to maintain state regarding which node to start the memcg reclaim from. 
This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim memory from nodes in round-robin order") and commit 453a9bf347f1 ("memcg: fix numa scan information update to be triggered by memory event"). [shakeelb@google.com: v2] Link: http://lkml.kernel.org/r/20191030204232.139424-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20191029234753.224143-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ---- mm/memcontrol.c | 112 --------------------------------------------- mm/vmscan.c | 14 ++---- 3 files changed, 5 insertions(+), 129 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e82928deea88..239e752a7817 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,7 +80,6 @@ struct mem_cgroup_id { enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -312,13 +311,6 @@ struct mem_cgroup { struct list_head kmem_caches; #endif - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94a5b6d831f9..529e12a59131 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -108,7 +108,6 @@ static const char *const mem_cgroup_lru_names[] = { #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 -#define NUMAINFO_EVENTS_TARGET 1024 /* * Cgroups above their limits are maintained in a RB-Tree, independent of @@ -877,9 +876,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_SOFTLIMIT: next = val + SOFTLIMIT_EVENTS_TARGET; break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; default: break; } @@ -899,21 +895,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { bool do_softlimit; - bool do_numainfo __maybe_unused; do_softlimit = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); -#if MAX_NUMNODES > 1 - do_numainfo = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_NUMAINFO); -#endif mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); -#if MAX_NUMNODES > 1 - if (unlikely(do_numainfo)) - atomic_inc(&memcg->numainfo_events); -#endif } } @@ -1591,104 +1578,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, return ret; } -#if MAX_NUMNODES > 1 - -/** - * test_mem_cgroup_node_reclaimable - * @memcg: the target memcg - * @nid: the node ID to be checked. - * @noswap : specify true here if the user wants flle only information. - * - * This function returns whether the specified memcg contains any - * reclaimable pages on a node. Returns true if there are any reclaimable - * pages in the node. 
- */ -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, - int nid, bool noswap) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - - if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || - lruvec_page_state(lruvec, NR_ACTIVE_FILE)) - return true; - if (noswap || !total_swap_pages) - return false; - if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || - lruvec_page_state(lruvec, NR_ACTIVE_ANON)) - return true; - return false; - -} - -/* - * Always updating the nodemask is not very good - even if we have an empty - * list or the wrong list here, we can start from some node and traverse all - * nodes based on the zonelist. So update the list loosely once per 10 secs. - * - */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) -{ - int nid; - /* - * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET - * pagein/pageout changes since the last update. - */ - if (!atomic_read(&memcg->numainfo_events)) - return; - if (atomic_inc_return(&memcg->numainfo_updating) > 1) - return; - - /* make a nodemask where this memcg uses memory from */ - memcg->scan_nodes = node_states[N_MEMORY]; - - for_each_node_mask(nid, node_states[N_MEMORY]) { - - if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) - node_clear(nid, memcg->scan_nodes); - } - - atomic_set(&memcg->numainfo_events, 0); - atomic_set(&memcg->numainfo_updating, 0); -} - -/* - * Selecting a node where we start reclaim from. Because what we need is just - * reducing usage counter, start from anywhere is O,K. Considering - * memory reclaim from current node, there are pros. and cons. - * - * Freeing memory from current node means freeing memory from a node which - * we'll use or we've used. So, it may make LRU bad. And if several threads - * hit limits, it will see a contention on a node. But freeing from remote - * node means more costs for memory reclaim because of memory latency. - * - * Now, we use round-robin. Better algorithm is welcomed. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) -{ - int node; - - mem_cgroup_may_update_nodemask(memcg); - node = memcg->last_scanned_node; - - node = next_node_in(node, memcg->scan_nodes); - /* - * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages - * last time it really checked all the LRUs due to rate limiting. - * Fallback to the current node in that case for simplicity. 
- */ - if (unlikely(node == MAX_NUMNODES)) - node = numa_node_id(); - - memcg->last_scanned_node = node; - return node; -} -#else -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) -{ - return 0; -} -#endif - static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, pg_data_t *pgdat, gfp_t gfp_mask, @@ -5073,7 +4962,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; INIT_WORK(&memcg->high_work, high_work_func); - memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index ee4eecc7e1c2..2beff0e0dc7b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3348,10 +3348,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, bool may_swap) { - struct zonelist *zonelist; unsigned long nr_reclaimed; unsigned long pflags; - int nid; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), @@ -3364,16 +3362,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = may_swap, }; - - set_task_reclaim_state(current, &sc.reclaim_state); /* - * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't - * take care of from where we get pages. So the node where we start the - * scan does not need to be the current node. + * Traverse the ZONELIST_FALLBACK zonelist of the current node to put + * equal pressure on all the nodes. This is based on the assumption that + * the reclaim does not bail out early. */ - nid = mem_cgroup_select_victim_node(memcg); + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); - zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; + set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); -- cgit v1.2.3-71-gd317 From 867e5e1de14b2b2bde324cdfeec3f3f83eb21424 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:34 -0800 Subject: mm: clean up and clarify lruvec lookup procedure There is a per-memcg lruvec and a NUMA node lruvec. Which one is being used is somewhat confusing right now, and it's easy to make mistakes - especially when it comes to global reclaim. How it works: when memory cgroups are enabled, we always use the root_mem_cgroup's per-node lruvecs. When memory cgroups are not compiled in or disabled at runtime, we use pgdat->lruvec. Document that in a comment. Due to the way the reclaim code is generalized, all lookups use the mem_cgroup_lruvec() helper function, and nobody should have to find the right lruvec manually right now. But to avoid future mistakes, rename the pgdat->lruvec member to pgdat->__lruvec and delete the convenience wrapper that suggests it's a commonly accessed member. While in this area, swap the mem_cgroup_lruvec() argument order. The name suggests a memcg operation, yet it takes a pgdat first and a memcg second. I have to double take every time I call this. Fix that. 
Link: http://lkml.kernel.org/r/20191022144803.302233-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 24 ++++++++++++------------ include/linux/mmzone.h | 15 ++++++++------- mm/memcontrol.c | 10 +++++----- mm/page_alloc.c | 2 +- mm/slab.h | 4 ++-- mm/vmscan.c | 6 +++--- mm/workingset.c | 8 ++++---- 7 files changed, 35 insertions(+), 34 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 239e752a7817..feeb2c76f568 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -385,21 +385,21 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) } /** - * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone - * @node: node of the wanted lruvec + * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * - * Returns the lru list vector holding pages for a given @node or a given - * @memcg. This can be the node lruvec, if the memory controller is disabled. + * Returns the lru list vector holding pages for a given @memcg & + * @node combination. This can be the node lruvec, if the memory + * controller is disabled. */ -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = node_lruvec(pgdat); + lruvec = &pgdat->__lruvec; goto out; } @@ -718,7 +718,7 @@ static inline void __mod_lruvec_page_state(struct page *page, return; } - lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -889,16 +889,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) { } -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { - return node_lruvec(pgdat); + return &pgdat->__lruvec; } static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { - return &pgdat->lruvec; + return &pgdat->__lruvec; } static inline bool mm_match_cgroup(struct mm_struct *mm, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c7fb21f19edd..cc8232a100bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -777,7 +777,13 @@ typedef struct pglist_data { #endif /* Fields commonly accessed by the page reclaim scanner */ - struct lruvec lruvec; + + /* + * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. + * + * Use mem_cgroup_lruvec() to look up lruvecs. 
+ */ + struct lruvec __lruvec; unsigned long flags; @@ -800,11 +806,6 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) -static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) -{ - return &pgdat->lruvec; -} - static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -842,7 +843,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #ifdef CONFIG_MEMCG return lruvec->pgdat; #else - return container_of(lruvec, struct pglist_data, lruvec); + return container_of(lruvec, struct pglist_data, __lruvec); #endif } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 529e12a59131..bc01423277c5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -777,7 +777,7 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) if (!memcg || memcg == root_mem_cgroup) { __mod_node_page_state(pgdat, idx, val); } else { - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } rcu_read_unlock(); @@ -1221,7 +1221,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &pgdat->lruvec; + lruvec = &pgdat->__lruvec; goto out; } @@ -3634,7 +3634,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); unsigned long nr = 0; enum lru_list lru; @@ -5338,8 +5338,8 @@ static int mem_cgroup_move_account(struct page *page, anon = PageAnon(page); pgdat = page_pgdat(page); - from_vec = mem_cgroup_lruvec(pgdat, from); - to_vec = mem_cgroup_lruvec(pgdat, to); + from_vec = mem_cgroup_lruvec(from, pgdat); + to_vec = mem_cgroup_lruvec(to, pgdat); spin_lock_irqsave(&from->move_lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3a69ba5ec53..4785a8a2040e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6713,7 +6713,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); - lruvec_init(node_lruvec(pgdat)); + lruvec_init(&pgdat->__lruvec); } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/slab.h b/mm/slab.h index 8b77f973a6ab..7e94700aa78c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -369,7 +369,7 @@ static __always_inline int memcg_charge_slab(struct page *page, if (ret) goto out; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); /* transer try_charge() page references to kmem_cache */ @@ -393,7 +393,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, rcu_read_lock(); memcg = READ_ONCE(s->memcg_params.memcg); if (likely(!mem_cgroup_is_root(memcg))) { - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); memcg_kmem_uncharge_memcg(page, order, memcg); } else { diff --git a/mm/vmscan.c b/mm/vmscan.c index 266620f7c814..94d73725813d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2545,7 
+2545,7 @@ out: static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, struct scan_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -3023,7 +3023,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) unsigned long refaults; struct lruvec *lruvec; - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); lruvec->refaults = refaults; } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); @@ -3379,7 +3379,7 @@ static void age_active_anon(struct pglist_data *pgdat, memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, diff --git a/mm/workingset.c b/mm/workingset.c index c963831d354f..e8212123c1c3 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -233,7 +233,7 @@ void *workingset_eviction(struct page *page) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); eviction = atomic_long_inc_return(&lruvec->inactive_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -280,7 +280,7 @@ void workingset_refault(struct page *page, void *shadow) memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); @@ -345,7 +345,7 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); atomic_long_inc(&lruvec->inactive_age); out: rcu_read_unlock(); @@ -426,7 +426,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; - lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); -- cgit v1.2.3-71-gd317 From 1b05117df78e035afb5f66ef50bf8750d976ef08 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:52 -0800 Subject: mm: vmscan: harmonize writeback congestion tracking for nodes & memcgs The current writeback congestion tracking has separate flags for kswapd reclaim (node level) and cgroup limit reclaim (memcg-node level). This is unnecessarily complicated: the lruvec is an existing abstraction layer for that node-memcg intersection. Introduce lruvec->flags and LRUVEC_CONGESTED. Then track that at the reclaim root level, which is either the NUMA node for global reclaim, or the cgroup-node intersection for cgroup reclaim. 
Link: http://lkml.kernel.org/r/20191022144803.302233-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++-- include/linux/mmzone.h | 11 ++++-- mm/vmscan.c | 84 ++++++++++++++-------------------------------- 3 files changed, 37 insertions(+), 64 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index feeb2c76f568..5b86287fa069 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -132,9 +132,6 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - bool congested; /* memcg has many dirty pages */ - /* backed by a congested BDI */ - struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -403,6 +400,9 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, goto out; } + if (!memcg) + memcg = root_mem_cgroup; + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); lruvec = &mz->lruvec; out: diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cc8232a100bd..ddee00e91a22 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -296,6 +296,12 @@ struct zone_reclaim_stat { unsigned long recent_scanned[2]; }; +enum lruvec_flags { + LRUVEC_CONGESTED, /* lruvec has many dirty pages + * backed by a congested BDI + */ +}; + struct lruvec { struct list_head lists[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; @@ -303,6 +309,8 @@ struct lruvec { atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -572,9 +580,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_CONGESTED, /* pgdat has many dirty pages backed by - * a congested BDI - */ PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. 
diff --git a/mm/vmscan.c b/mm/vmscan.c index d35864850b43..39589e561c8f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -267,29 +267,6 @@ static bool writeback_throttling_sane(struct scan_control *sc) #endif return false; } - -static void set_memcg_congestion(pg_data_t *pgdat, - struct mem_cgroup *memcg, - bool congested) -{ - struct mem_cgroup_per_node *mn; - - if (!memcg) - return; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - WRITE_ONCE(mn->congested, congested); -} - -static bool memcg_congested(pg_data_t *pgdat, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *mn; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - return READ_ONCE(mn->congested); - -} #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -309,18 +286,6 @@ static bool writeback_throttling_sane(struct scan_control *sc) { return true; } - -static inline void set_memcg_congestion(struct pglist_data *pgdat, - struct mem_cgroup *memcg, bool congested) -{ -} - -static inline bool memcg_congested(struct pglist_data *pgdat, - struct mem_cgroup *memcg) -{ - return false; - -} #endif /* @@ -2716,12 +2681,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } -static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) -{ - return test_bit(PGDAT_CONGESTED, &pgdat->flags) || - (memcg && memcg_congested(pgdat, memcg)); -} - static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -2783,10 +2742,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; - struct mem_cgroup *target_memcg = sc->target_mem_cgroup; unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; bool reclaimable = false; + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + again: memset(&sc->nr, 0, sizeof(sc->nr)); @@ -2801,7 +2762,7 @@ again: } /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, target_memcg, true, + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2829,14 +2790,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* - * Tag a node as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. - */ - if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty == sc->nr.file_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); @@ -2852,12 +2805,17 @@ again: } /* + * Tag a node/memcg as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. + * * Legacy memcg will stall in page writeback so avoid forcibly * stalling in wait_iff_congested(). 
*/ - if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_memcg_congestion(pgdat, target_memcg, true); + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); /* * Stall direct reclaim for IO completions if underlying BDIs @@ -2865,9 +2823,9 @@ again: * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle() && - pgdat_memcg_congested(pgdat, target_memcg)) + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) wait_iff_congested(BLK_RW_ASYNC, HZ/10); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, @@ -3081,8 +3039,16 @@ retry: if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); - set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); + + if (cgroup_reclaim(sc)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, + zone->zone_pgdat); + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + } } delayacct_freepages_end(); @@ -3450,7 +3416,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { - clear_bit(PGDAT_CONGESTED, &pgdat->flags); + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } -- cgit v1.2.3-71-gd317 From b910718a948a9120d90faf632b33ed23c70e266a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:59 -0800 Subject: mm: vmscan: detect file thrashing at the reclaim root We use refault information to determine whether the cache workingset is stable or transitioning, and dynamically adjust the inactive:active file LRU ratio so as to maximize protection from one-off cache during stable periods, and minimize IO during transitions. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, refaults only affect the local LRU order in the cgroup in which they are occuring. As a result, cache transitions can take longer in a cgrouped system as the active pages of sibling cgroups aren't challenged when they should be. [ Right now, this is somewhat theoretical, because the siblings, under continued regular reclaim pressure, should eventually run out of inactive pages - and since inactive:active *size* balancing is also done on a cgroup-local level, we will challenge the active pages eventually in most cases. But the next patch will move that relative size enforcement to the reclaim root as well, and then this patch here will be necessary to propagate refault pressure to siblings. ] This patch moves refault detection to the root of reclaim. Instead of remembering the cgroup owner of an evicted page, remember the cgroup that caused the reclaim to happen. When refaults later occur, they'll correctly influence the cross-cgroup LRU order that reclaim follows. I.e. 
if global reclaim kicked out pages in some subgroup A/B/C, the refault of those pages will challenge the global LRU order, and not just the local order down inside C. [hannes@cmpxchg.org: use page_memcg() instead of another lookup] Link: http://lkml.kernel.org/r/20191115160722.GA309754@cmpxchg.org Link: http://lkml.kernel.org/r/20191107205334.158354-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Suren Baghdasaryan Cc: Andrey Ryabinin Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++ include/linux/swap.h | 2 +- mm/vmscan.c | 32 +++++++++++----------- mm/workingset.c | 67 +++++++++++++++++++++++++++++++++++----------- 4 files changed, 73 insertions(+), 33 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5b86287fa069..a7a0a1a5c8d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -901,6 +901,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, return &pgdat->__lruvec; } +static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +{ + return NULL; +} + static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 063c0c1e112b..1e99f7ac1d7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ -void *workingset_eviction(struct page *page); +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 725b5d4784f7..39657012e2d8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -853,7 +853,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) * gets returned with a refcount of 0. 
*/ static int __remove_mapping(struct address_space *mapping, struct page *page, - bool reclaimed) + bool reclaimed, struct mem_cgroup *target_memcg) { unsigned long flags; int refcount; @@ -925,7 +925,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(page); + shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); @@ -948,7 +948,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page, false)) { + if (__remove_mapping(mapping, page, false, NULL)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -1426,7 +1426,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); - } else if (!mapping || !__remove_mapping(mapping, page, true)) + } else if (!mapping || !__remove_mapping(mapping, page, true, + sc->target_mem_cgroup)) goto keep_locked; unlock_page(page); @@ -2189,6 +2190,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, enum lru_list inactive_lru = file * LRU_FILE; unsigned long inactive, active; unsigned long inactive_ratio; + struct lruvec *target_lruvec; unsigned long refaults; unsigned long gb; @@ -2200,8 +2202,9 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, * is being established. Disable active list protection to get * rid of the stale workingset quickly. */ - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - if (file && lruvec->refaults != refaults) { + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); + if (file && target_lruvec->refaults != refaults) { inactive_ratio = 0; } else { gb = (inactive + active) >> (30 - PAGE_SHIFT); @@ -2973,19 +2976,14 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } -static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_iter(root_memcg, NULL, NULL); - do { - unsigned long refaults; - struct lruvec *lruvec; + struct lruvec *target_lruvec; + unsigned long refaults; - lruvec = mem_cgroup_lruvec(memcg, pgdat); - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - lruvec->refaults = refaults; - } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); + target_lruvec->refaults = refaults; } /* diff --git a/mm/workingset.c b/mm/workingset.c index e8212123c1c3..474186b76ced 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -213,28 +213,53 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } +static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + /* + * Reclaiming a cgroup means reclaiming all its children in a + * round-robin fashion. 
That means that each cgroup has an LRU + * order that is composed of the LRU orders of its child + * cgroups; and every page has an LRU position not just in the + * cgroup that owns it, but in all of that group's ancestors. + * + * So when the physical inactive list of a leaf cgroup ages, + * the virtual inactive lists of all its parents, including + * the root cgroup's, age as well. + */ + do { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + atomic_long_inc(&lruvec->inactive_age); + } while (memcg && (memcg = parent_mem_cgroup(memcg))); +} + /** * workingset_eviction - note the eviction of a page from memory + * @target_memcg: the cgroup that is causing the reclaim * @page: the page being evicted * * Returns a shadow entry to be stored in @page->mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ -void *workingset_eviction(struct page *page) +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = page_pgdat(page); - struct mem_cgroup *memcg = page_memcg(page); - int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; + int memcgid; /* Page is fully exclusive and pins page->mem_cgroup */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - eviction = atomic_long_inc_return(&lruvec->inactive_age); + advance_inactive_age(page_memcg(page), pgdat); + + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->inactive_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -244,10 +269,13 @@ void *workingset_eviction(struct page *page) * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously - * evicted page in the context of the node it was allocated in. + * evicted page in the context of the node and the memcg whose memory + * pressure caused the eviction. */ void workingset_refault(struct page *page, void *shadow) { + struct mem_cgroup *eviction_memcg; + struct lruvec *eviction_lruvec; unsigned long refault_distance; struct pglist_data *pgdat; unsigned long active_file; @@ -277,12 +305,12 @@ void workingset_refault(struct page *page, void *shadow) * would be better if the root_mem_cgroup existed in all * configurations instead. */ - memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) goto out; - lruvec = mem_cgroup_lruvec(memcg, pgdat); - refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); + refault = atomic_long_read(&eviction_lruvec->inactive_age); + active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); /* * Calculate the refault distance @@ -302,6 +330,17 @@ void workingset_refault(struct page *page, void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; + /* + * The activation decision for this page is made at the level + * where the eviction occurred, as that is where the LRU order + * during page reclaim is being determined. + * + * However, the cgroup that will own the page is the one that + * is actually experiencing the refault event. 
+ */ + memcg = page_memcg(page); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); /* @@ -313,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); - atomic_long_inc(&lruvec->inactive_age); + advance_inactive_age(memcg, pgdat); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); /* Page was active prior to eviction */ @@ -332,7 +371,6 @@ out: void workingset_activation(struct page *page) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); /* @@ -345,8 +383,7 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); - atomic_long_inc(&lruvec->inactive_age); + advance_inactive_age(memcg, page_pgdat(page)); out: rcu_read_unlock(); } -- cgit v1.2.3-71-gd317
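Editor's note: the common thread of the series above is the lruvec lookup contract that mem_cgroup_lruvec() ends up with — memcg argument first, a NULL memcg falling back to root_mem_cgroup, and pgdat->__lruvec used only when the memory controller is disabled. The sketch below is a minimal userspace model of just that lookup rule, not kernel code: the names mirror the kernel's, but the structures are stripped-down stand-ins invented for illustration, and the "owner" field exists only so the demo can print which lruvec was selected.

```c
/*
 * Userspace model (assumption: simplified stand-in types, not the kernel's)
 * of the lruvec lookup rules settled by the series above:
 *  - memcg enabled: every lookup resolves to a per-memcg, per-node lruvec,
 *    using the root memcg when no cgroup is given (global reclaim);
 *  - memcg disabled: the lookup resolves to the node's own pgdat->__lruvec.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES 2

struct lruvec {
	const char *owner;	/* demo-only: who this lruvec belongs to */
};

struct pglist_data {
	int node_id;
	struct lruvec __lruvec;	/* used only when the controller is off */
};

struct mem_cgroup_per_node {
	struct lruvec lruvec;
};

struct mem_cgroup {
	const char *name;
	struct mem_cgroup_per_node nodeinfo[MAX_NUMNODES];
};

static struct mem_cgroup root_memcg_obj = { .name = "root" };
static struct mem_cgroup *root_mem_cgroup = &root_memcg_obj;
static bool cgroup_memory_disabled;	/* models mem_cgroup_disabled() */

/* Argument order as after the series: memcg first, then the node. */
static struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
					struct pglist_data *pgdat)
{
	if (cgroup_memory_disabled)
		return &pgdat->__lruvec;	/* the node's own lruvec */
	if (!memcg)
		memcg = root_mem_cgroup;	/* global reclaim -> root memcg */
	return &memcg->nodeinfo[pgdat->node_id].lruvec;
}

int main(void)
{
	struct pglist_data node0 = { .node_id = 0, .__lruvec = { "node0" } };
	struct mem_cgroup child = { .name = "child" };

	root_mem_cgroup->nodeinfo[0].lruvec.owner = "root/node0";
	child.nodeinfo[0].lruvec.owner = "child/node0";

	printf("cgroup reclaim: %s\n", mem_cgroup_lruvec(&child, &node0)->owner);
	printf("global reclaim: %s\n", mem_cgroup_lruvec(NULL, &node0)->owner);

	cgroup_memory_disabled = true;
	printf("memcg disabled: %s\n", mem_cgroup_lruvec(NULL, &node0)->owner);
	return 0;
}
```

Run as a plain C program, it prints "child/node0", "root/node0", then "node0" — the three cases documented in the patched mem_cgroup_lruvec() and in the comment added above pgdat->__lruvec.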