From feee6b2989165631b17ac6d4ccdbf6759254e85a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 4 Jan 2020 12:59:33 -0800 Subject: mm/memory_hotplug: shrink zones when offlining memory We currently try to shrink a single zone when removing memory. We use the zone of the first page of the memory we are removing. If that memmap was never initialized (e.g., memory was never onlined), we will read garbage and can trigger kernel BUGs (due to a stale pointer): BUG: unable to handle page fault for address: 000000000000353d #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 7 Comm: kworker/u8:0 Not tainted 5.3.0-rc5-next-20190820+ #317 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.4 Workqueue: kacpi_hotplug acpi_hotplug_work_fn RIP: 0010:clear_zone_contiguous+0x5/0x10 Code: 48 89 c6 48 89 c3 e8 2a fe ff ff 48 85 c0 75 cf 5b 5d c3 c6 85 fd 05 00 00 01 5b 5d c3 0f 1f 840 RSP: 0018:ffffad2400043c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000200000000 RCX: 0000000000000000 RDX: 0000000000200000 RSI: 0000000000140000 RDI: 0000000000002f40 RBP: 0000000140000000 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000140000 R13: 0000000000140000 R14: 0000000000002f40 R15: ffff9e3e7aff3680 FS: 0000000000000000(0000) GS:ffff9e3e7bb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000353d CR3: 0000000058610000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __remove_pages+0x4b/0x640 arch_remove_memory+0x63/0x8d try_remove_memory+0xdb/0x130 __remove_memory+0xa/0x11 acpi_memory_device_remove+0x70/0x100 acpi_bus_trim+0x55/0x90 acpi_device_hotplug+0x227/0x3a0 acpi_hotplug_work_fn+0x1a/0x30 process_one_work+0x221/0x550 worker_thread+0x50/0x3b0 kthread+0x105/0x140 ret_from_fork+0x3a/0x50 Modules linked in: CR2: 000000000000353d Instead, shrink the zones when offlining memory or when onlining failed. Introduce and use remove_pfn_range_from_zone(() for that. We now properly shrink the zones, even if we have DIMMs whereby - Some memory blocks fall into no zone (never onlined) - Some memory blocks fall into multiple zones (offlined+re-onlined) - Multiple memory blocks that fall into different zones Drop the zone parameter (with a potential dubious value) from __remove_pages() and __remove_section(). Link: http://lkml.kernel.org/r/20191006085646.5768-6-david@redhat.com Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible after d0dc12e86b319] Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: "Matthew Wilcox (Oracle)" Cc: "Aneesh Kumar K.V" Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Logan Gunthorpe Cc: [5.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3a08ecdfca11..ba0dca6aac6e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -122,8 +122,8 @@ static inline bool movable_node_is_enabled(void) extern void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); -extern void __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, + struct vmem_altmap *altmap); /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, @@ -342,6 +342,9 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); +extern void remove_pfn_range_from_zone(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -- cgit v1.2.3-71-gd317 From bd5c2344f9eb1ebf7ff2501ddb13d83151939780 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 30 Jan 2020 22:14:54 -0800 Subject: mm/memory_hotplug: pass in nid to online_pages() Patch series "mm/memory_hotplug: pass in nid to online_pages()". Simplify onlining code and get rid of find_memory_block(). Pass in the nid from the memory block we are trying to online directly, instead of manually looking it up. This patch (of 2): No need to lookup the memory block, we can directly pass in the nid. Link: http://lkml.kernel.org/r/20200113113354.6341-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Cc: Anshuman Khandual Cc: Dan Williams Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 6 +++--- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 13 ++----------- 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 064643927827..15659306ad69 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -206,7 +206,7 @@ static bool pages_correctly_probed(unsigned long start_pfn) */ static int memory_block_action(unsigned long start_section_nr, unsigned long action, - int online_type) + int online_type, int nid) { unsigned long start_pfn; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; @@ -219,7 +219,7 @@ memory_block_action(unsigned long start_section_nr, unsigned long action, if (!pages_correctly_probed(start_pfn)) return -EBUSY; - ret = online_pages(start_pfn, nr_pages, online_type); + ret = online_pages(start_pfn, nr_pages, online_type, nid); break; case MEM_OFFLINE: ret = offline_pages(start_pfn, nr_pages); @@ -245,7 +245,7 @@ static int memory_block_change_state(struct memory_block *mem, mem->state = MEM_GOING_OFFLINE; ret = memory_block_action(mem->start_section_nr, to_state, - mem->online_type); + mem->online_type, mem->nid); mem->state = ret ? from_state_req : to_state; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ba0dca6aac6e..ffa6ad12d84a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -94,7 +94,8 @@ extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ -extern int online_pages(unsigned long, unsigned long, int); +extern int online_pages(unsigned long pfn, unsigned long nr_pages, + int online_type, int nid); extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); extern unsigned long __offline_isolated_pages(unsigned long start_pfn, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c3dc6b27fd6d..36d80915ddc2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -783,27 +783,18 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + int online_type, int nid) { unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; - int nid; int ret; struct memory_notify arg; - struct memory_block *mem; mem_hotplug_begin(); - /* - * We can't use pfn_to_nid() because nid might be stored in struct page - * which is not yet initialized. Instead, we find nid from memory block. - */ - mem = find_memory_block(__pfn_to_section(pfn)); - nid = mem->nid; - put_device(&mem->dev); - /* associate pfn range with the zone */ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); -- cgit v1.2.3-71-gd317 From 92917998849eea951707c8fea2dc3007bb2ad2cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:26 -0800 Subject: mm/memory_hotplug: drop valid_start/valid_end from test_pages_in_a_zone() The callers are only interested in the actual zone, they don't care about boundaries. Return the zone instead to simplify. Link: http://lkml.kernel.org/r/20200110183308.11849-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 9 ++++----- include/linux/memory_hotplug.h | 4 ++-- mm/memory_hotplug.c | 31 +++++++++---------------------- 3 files changed, 15 insertions(+), 29 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 15659306ad69..b9f474c11393 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -376,7 +376,6 @@ static ssize_t valid_zones_show(struct device *dev, struct memory_block *mem = to_memory_block(dev); unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long valid_start_pfn, valid_end_pfn; struct zone *default_zone; int nid; @@ -389,11 +388,11 @@ static ssize_t valid_zones_show(struct device *dev, * The block contains more than one zone can not be offlined. * This can happen e.g. for ZONE_DMA and ZONE_DMA32 */ - if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, - &valid_start_pfn, &valid_end_pfn)) + default_zone = test_pages_in_a_zone(start_pfn, + start_pfn + nr_pages); + if (!default_zone) return sprintf(buf, "none\n"); - start_pfn = valid_start_pfn; - strcat(buf, page_zone(pfn_to_page(start_pfn))->name); + strcat(buf, default_zone->name); goto out; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ffa6ad12d84a..f4d59155f3d4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -96,8 +96,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid); -extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, - unsigned long *valid_start, unsigned long *valid_end); +extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, + unsigned long end_pfn); extern unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4344e85213f2..0a54ffac8c68 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1172,14 +1172,13 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) } /* - * Confirm all pages in a range [start, end) belong to the same zone. - * When true, return its valid [start, end). + * Confirm all pages in a range [start, end) belong to the same zone (skipping + * memory holes). When true, return the zone. */ -int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, - unsigned long *valid_start, unsigned long *valid_end) +struct zone *test_pages_in_a_zone(unsigned long start_pfn, + unsigned long end_pfn) { unsigned long pfn, sec_end_pfn; - unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; @@ -1200,24 +1199,15 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, continue; /* Check if we got outside of the zone */ if (zone && !zone_spans_pfn(zone, pfn + i)) - return 0; + return NULL; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) - return 0; - if (!zone) - start = pfn + i; + return NULL; zone = page_zone(page); - end = pfn + MAX_ORDER_NR_PAGES; } } - if (zone) { - *valid_start = start; - *valid_end = min(end, end_pfn); - return 1; - } else { - return 0; - } + return zone; } /* @@ -1462,7 +1452,6 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; - unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; char *reason; @@ -1487,14 +1476,12 @@ static int __ref __offline_pages(unsigned long start_pfn, /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, - &valid_end)) { + zone = test_pages_in_a_zone(start_pfn, end_pfn); + if (!zone) { ret = -EINVAL; reason = "multizone range"; goto failed_removal; } - - zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); /* set above range as isolated */ -- cgit v1.2.3-71-gd317