From 028fc57a1c361116e3bcebfeba4ca87878baaf4f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:26 -0700 Subject: drivers/base/memory: introduce "memory groups" to logically group memory blocks In our "auto-movable" memory onlining policy, we want to make decisions across memory blocks of a single memory device. Examples of memory devices include ACPI memory devices (in the simplest case a single DIMM) and virtio-mem. For now, we don't have a connection between a single memory block device and the real memory device. Each memory device consists of 1..X memory block devices. Let's logically group memory blocks belonging to the same memory device in "memory groups". Memory groups can span multiple physical ranges and a memory group itself does not contain any information regarding physical ranges, only properties (e.g., "max_pages") necessary for improved memory onlining. Introduce two memory group types: 1) Static memory group: E.g., a single ACPI memory device, consisting of 1..X memory resources. A memory group consists of 1..Y memory blocks. The whole group is added/removed in one go. If any part cannot get offlined, the whole group cannot be removed. 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is dynamically added/removed in a fixed granularity, called a "unit", consisting of 1..X memory blocks. A unit is added/removed in one go. If any part of a unit cannot get offlined, the whole unit cannot be removed. In case of 1) we usually want either all memory managed by ZONE_MOVABLE or none. In case of 2) we usually want to have as many units as possible managed by ZONE_MOVABLE. We want a single unit to be of the same type. For now, memory groups are an internal concept that is not exposed to user space; we might want to change that in the future, though. add_memory() users can specify a mgid instead of a nid when passing the MHP_NID_IS_MGID flag. Link: https://lkml.kernel.org/r/20210806124715.17090-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) (limited to 'include/linux/memory.h') diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d505c12c5c77 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -23,6 +23,42 @@ #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) +/** + * struct memory_group - a logical group of memory blocks + * @nid: The node id for all memory blocks inside the memory group. + * @blocks: List of all memory blocks belonging to this memory group. + * @is_dynamic: The memory group type: static vs. dynamic + * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum + * number of pages we'll have in this static memory group. + * @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages + * in which memory is added/removed in this dynamic memory group. + * This granularity defines the alignment of a unit in physical + * address space; it has to be at least as big as a single + * memory block. + * + * A memory group logically groups memory blocks; each memory block + * belongs to at most one memory group. A memory group corresponds to + * a memory device, such as a DIMM or a NUMA node, which spans multiple + * memory blocks and might even span multiple non-contiguous physical memory + * ranges. + * + * Modification of members after registration is serialized by memory + * hot(un)plug code. + */ +struct memory_group { + int nid; + struct list_head memory_blocks; + bool is_dynamic; + union { + struct { + unsigned long max_pages; + } s; + struct { + unsigned long unit_pages; + } d; + }; +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -34,6 +70,8 @@ struct memory_block { * lay at the beginning of the memory block. */ unsigned long nr_vmemmap_pages; + struct memory_group *group; /* group (if any) for this block */ + struct list_head group_next; /* next block inside memory group */ }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -86,7 +124,8 @@ static inline int memory_notify(unsigned long val, void *v) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages); + unsigned long vmemmap_pages, + struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); @@ -96,6 +135,11 @@ extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< Date: Tue, 7 Sep 2021 19:55:30 -0700 Subject: mm/memory_hotplug: track present pages in memory groups Let's track all present pages in each memory group. Especially, track memory present in ZONE_MOVABLE and memory present in one of the kernel zones (which really only is ZONE_NORMAL right now as memory groups only apply to hotplugged memory) separately within a memory group, to prepare for making smart auto-online decision for individual memory blocks within a memory group based on group statistics. Link: https://lkml.kernel.org/r/20210806124715.17090-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 10 +++++----- include/linux/memory.h | 6 ++++++ include/linux/memory_hotplug.h | 13 +++++++++---- mm/memory_hotplug.c | 19 ++++++++++++++----- 4 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include/linux/memory.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 16f5a3610229..a1082013e10c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,7 +198,7 @@ static int memory_block_online(struct memory_block *mem) } ret = online_pages(start_pfn + nr_vmemmap_pages, - nr_pages - nr_vmemmap_pages, zone); + nr_pages - nr_vmemmap_pages, zone, mem->group); if (ret) { if (nr_vmemmap_pages) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); @@ -210,7 +210,7 @@ static int memory_block_online(struct memory_block *mem) * now already properly populated. */ if (nr_vmemmap_pages) - adjust_present_page_count(pfn_to_page(start_pfn), + adjust_present_page_count(pfn_to_page(start_pfn), mem->group, nr_vmemmap_pages); return ret; @@ -228,16 +228,16 @@ static int memory_block_offline(struct memory_block *mem) * can properly be torn down in offline_pages(). */ if (nr_vmemmap_pages) - adjust_present_page_count(pfn_to_page(start_pfn), + adjust_present_page_count(pfn_to_page(start_pfn), mem->group, -nr_vmemmap_pages); ret = offline_pages(start_pfn + nr_vmemmap_pages, - nr_pages - nr_vmemmap_pages); + nr_pages - nr_vmemmap_pages, mem->group); if (ret) { /* offline_pages() failed. Account back. */ if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), - nr_vmemmap_pages); + mem->group, nr_vmemmap_pages); return ret; } diff --git a/include/linux/memory.h b/include/linux/memory.h index d505c12c5c77..6ffdc1db385f 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -27,6 +27,10 @@ * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. * @blocks: List of all memory blocks belonging to this memory group. + * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this + * memory group. + * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this + * memory group. * @is_dynamic: The memory group type: static vs. dynamic * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum * number of pages we'll have in this static memory group. @@ -48,6 +52,8 @@ struct memory_group { int nid; struct list_head memory_blocks; + unsigned long present_kernel_pages; + unsigned long present_movable_pages; bool is_dynamic; union { struct { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5d341978b4bc..cf3f423c8a74 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -12,6 +12,7 @@ struct zone; struct pglist_data; struct mem_section; struct memory_block; +struct memory_group; struct resource; struct vmem_altmap; @@ -100,13 +101,15 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct page *page, long nr_pages); +extern void adjust_present_page_count(struct page *page, + struct memory_group *group, + long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, struct memory_group *group); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -296,7 +299,8 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE extern void try_offline_node(int nid); -extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); +extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group); extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -304,7 +308,8 @@ extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} -static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { return -EINVAL; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd57a296dd27..8199a4f98b2b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -915,9 +915,11 @@ struct zone *zone_for_pfn_range(int online_type, int nid, * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. */ -void adjust_present_page_count(struct page *page, long nr_pages) +void adjust_present_page_count(struct page *page, struct memory_group *group, + long nr_pages) { struct zone *zone = page_zone(page); + const bool movable = zone_idx(zone) == ZONE_MOVABLE; /* * We only support onlining/offlining/adding/removing of complete @@ -927,6 +929,11 @@ void adjust_present_page_count(struct page *page, long nr_pages) zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; + + if (group && movable) + group->present_movable_pages += nr_pages; + else if (group && !movable) + group->present_kernel_pages += nr_pages; } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, @@ -972,7 +979,8 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) { unsigned long flags; int need_zonelists_rebuild = 0; @@ -1025,7 +1033,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z } online_pages_range(pfn, nr_pages); - adjust_present_page_count(pfn_to_page(pfn), nr_pages); + adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1769,7 +1777,8 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } -int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, system_ram_pages = 0; @@ -1905,7 +1914,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); -- cgit v1.2.3-71-gd317 From 3fcebf90209a7f52d384ad7701425aa91be309ab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:48 -0700 Subject: mm/memory_hotplug: improved dynamic memory group aware "auto-movable" online policy Currently, the "auto-movable" online policy does not allow for hotplugged KERNEL (ZONE_NORMAL) memory to increase the amount of MOVABLE memory we can have, primarily, because there is no coordiantion across memory devices and we don't want to create zone-imbalances accidentially when unplugging memory. However, within a single memory device it's different. Let's allow for KERNEL memory within a dynamic memory group to allow for more MOVABLE within the same memory group. The only thing we have to take care of is that the managing driver avoids zone imbalances by unplugging MOVABLE memory first, otherwise there can be corner cases where unplug of memory could result in (accidential) zone imbalances. virtio-mem is the only user of dynamic memory groups and recently added support for prioritizing unplug of ZONE_MOVABLE over ZONE_NORMAL, so we don't need a new toggle to enable it for dynamic memory groups. We limit this handling to dynamic memory groups, because: * We want to keep the runtime overhead for collecting stats when onlining a single memory block small. We tend to have only a handful of dynamic memory groups, but we can have quite some static memory groups (e.g., 256 DIMMs). * It doesn't make too much sense for static memory groups, as we try onlining all applicable memory blocks either completely to ZONE_MOVABLE or not. In ordinary operation, we won't have a mixture of zones within a static memory group. When adding memory to a dynamic memory group, we'll first online memory to ZONE_MOVABLE as long as early KERNEL memory allows for it. Then, we'll online the next unit(s) to ZONE_NORMAL, until we can online the next unit(s) to ZONE_MOVABLE. For a simple virtio-mem device with a MOVABLE:KERNEL ratio of 3:1, it will result in a layout like: [M][M][M][M][M][M][M][M][N][M][M][M][N][M][M][M]... ^ movable memory due to early kernel memory ^ allows for more movable memory ... ^-----^ ... here ^ allows for more movable memory ... ^-----^ ... here While the created layout is sub-optimal when it comes to contiguous zones, it gives us the maximum flexibility when dynamically growing/shrinking a device; we can grow small VMs really big in small steps, and still shrink reliably to e.g., 1/4 of the maximum VM size in this example, removing full memory blocks along with meta data more reliably. Mark dynamic memory groups in the xarray such that we can efficiently iterate over them when collecting stats. In usual setups, we have one virtio-mem device per NUMA node, and usually only a small number of NUMA nodes. Note: for now, there seems to be no compelling reason to make this behavior configurable. Link: https://lkml.kernel.org/r/20210806124715.17090-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 30 +++++++++++++++++++++++++ include/linux/memory.h | 3 +++ mm/memory_hotplug.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 89 insertions(+), 4 deletions(-) (limited to 'include/linux/memory.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b699ddc42693..440fd656c002 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -86,6 +86,7 @@ static DEFINE_XARRAY(memory_blocks); * Memory groups, indexed by memory group id (mgid). */ static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); +#define MEMORY_GROUP_MARK_DYNAMIC XA_MARK_1 static BLOCKING_NOTIFIER_HEAD(memory_chain); @@ -939,6 +940,8 @@ static int memory_group_register(struct memory_group group) if (ret) { kfree(new_group); return ret; + } else if (group.is_dynamic) { + xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC); } return mgid; } @@ -1044,3 +1047,30 @@ struct memory_group *memory_group_find_by_id(int mgid) { return xa_load(&memory_groups, mgid); } + +/* + * This is an internal helper only to be used in core memory hotplug code to + * walk all dynamic memory groups excluding a given memory group, either + * belonging to a specific node, or belonging to any node. + */ +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg) +{ + struct memory_group *group; + unsigned long index; + int ret = 0; + + xa_for_each_marked(&memory_groups, index, group, + MEMORY_GROUP_MARK_DYNAMIC) { + if (group == excluded) + continue; +#ifdef CONFIG_NUMA + if (nid != NUMA_NO_NODE && group->nid != nid) + continue; +#endif /* CONFIG_NUMA */ + ret = func(group, arg); + if (ret) + break; + } + return ret; +} diff --git a/include/linux/memory.h b/include/linux/memory.h index 6ffdc1db385f..cbcc43ad2b97 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -146,6 +146,9 @@ extern int memory_group_register_static(int nid, unsigned long max_pages); extern int memory_group_register_dynamic(int nid, unsigned long unit_pages); extern int memory_group_unregister(int mgid); struct memory_group *memory_group_find_by_id(int mgid); +typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg); #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 248e2ba4ac59..b80fb8164fb8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -752,11 +752,44 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, #endif /* CONFIG_CMA */ } } +struct auto_movable_group_stats { + unsigned long movable_pages; + unsigned long req_kernel_early_pages; +}; -static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) +static int auto_movable_stats_account_group(struct memory_group *group, + void *arg) +{ + const int ratio = READ_ONCE(auto_movable_ratio); + struct auto_movable_group_stats *stats = arg; + long pages; + + /* + * We don't support modifying the config while the auto-movable online + * policy is already enabled. Just avoid the division by zero below. + */ + if (!ratio) + return 0; + + /* + * Calculate how many early kernel pages this group requires to + * satisfy the configured zone ratio. + */ + pages = group->present_movable_pages * 100 / ratio; + pages -= group->present_kernel_pages; + + if (pages > 0) + stats->req_kernel_early_pages += pages; + stats->movable_pages += group->present_movable_pages; + return 0; +} + +static bool auto_movable_can_online_movable(int nid, struct memory_group *group, + unsigned long nr_pages) { - struct auto_movable_stats stats = {}; unsigned long kernel_early_pages, movable_pages; + struct auto_movable_group_stats group_stats = {}; + struct auto_movable_stats stats = {}; pg_data_t *pgdat = NODE_DATA(nid); struct zone *zone; int i; @@ -777,6 +810,21 @@ static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) kernel_early_pages = stats.kernel_early_pages; movable_pages = stats.movable_pages; + /* + * Kernel memory inside dynamic memory group allows for more MOVABLE + * memory within the same group. Remove the effect of all but the + * current group from the stats. + */ + walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, + group, &group_stats); + if (kernel_early_pages <= group_stats.req_kernel_early_pages) + return false; + kernel_early_pages -= group_stats.req_kernel_early_pages; + movable_pages -= group_stats.movable_pages; + + if (group && group->is_dynamic) + kernel_early_pages += group->present_kernel_pages; + /* * Test if we could online the given number of pages to ZONE_MOVABLE * and still stay in the configured ratio. @@ -834,6 +882,10 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * with unmovable allocations). While there are corner cases where it might * still work, it is barely relevant in practice. * + * Exceptions are dynamic memory groups, which allow for more MOVABLE + * memory within the same memory group -- because in that case, there is + * coordination within the single memory device managed by a single driver. + * * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the @@ -899,12 +951,12 @@ static struct zone *auto_movable_zone_for_pfn(int nid, * nobody interferes, all will be MOVABLE if possible. */ nr_pages = max_pages - online_pages; - if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages)) + if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) goto kernel_zone; #ifdef CONFIG_NUMA if (auto_movable_numa_aware && - !auto_movable_can_online_movable(nid, nr_pages)) + !auto_movable_can_online_movable(nid, group, nr_pages)) goto kernel_zone; #endif /* CONFIG_NUMA */ -- cgit v1.2.3-71-gd317