From cf23422b9d76215316855253da491d4c9f294372 Mon Sep 17 00:00:00 2001
From: minskey guo
Date: Mon, 24 May 2010 14:32:41 -0700
Subject: cpu/mem hotplug: enable CPUs online before local memory online

Enable users to online CPUs even if the CPUs belong to a NUMA node which
doesn't have onlined local memory.

The zonelists (pg_data_t.node_zonelists[]) of a NUMA node are created either
during system boot/init, or at the time its local memory is onlined.  For a
NUMA node without onlined local memory, its zonelists are currently not
initialized.  As a result, any memory allocation operation executed by CPUs
within this node will fail.  In fact, an out-of-memory error is triggered
when attempting to online CPUs before their memory comes online.

This patch creates zonelists for such NUMA nodes, so that memory allocations
from these nodes can fall back to other nodes.

[akpm@linux-foundation.org: remove unneeded export]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: minskey guo
Cc: Minchan Kim
Cc: Yasunori Goto
Cc: Andi Kleen
Cc: Christoph Lameter
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory_hotplug.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'mm/memory_hotplug.c')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index be211a582930..85eb4d342ac5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -482,6 +482,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
+/*
+ * called by cpu_up() to online a node without onlined memory.
+ */
+int mem_online_node(int nid)
+{
+        pg_data_t *pgdat;
+        int ret;
+
+        lock_system_sleep();
+        pgdat = hotadd_new_pgdat(nid, 0);
+        if (pgdat) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        node_set_online(nid);
+        ret = register_one_node(nid);
+        BUG_ON(ret);
+
+out:
+        unlock_system_sleep();
+        return ret;
+}
+
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
 int __ref add_memory(int nid, u64 start, u64 size)
 {
-- cgit v1.2.3-71-gd317
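This cgit view is limited to mm/memory_hotplug.c, so the caller side of
mem_online_node() is not shown above.  As a rough sketch (not the verbatim
kernel/cpu.c change from this series), cpu_up() is expected to bring the
CPU's node online first, so the CPU has zonelists to fall back on even though
the node has no onlined local memory yet:

#ifdef CONFIG_MEMORY_HOTPLUG
        /* Sketch of the intended cpu_up() usage: online the node (creating
         * its zonelists) before onlining the CPU itself. */
        int nid = cpu_to_node(cpu);

        if (!node_online(nid)) {
                int err = mem_online_node(nid);
                if (err)
                        return err;
        }
#endif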
From 1f522509c77a5dea8dc384b735314f03908a6415 Mon Sep 17 00:00:00 2001
From: Haicheng Li
Date: Mon, 24 May 2010 14:32:51 -0700
Subject: mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset

Each newly populated zone of a hot-added node needs its pagesets updated with
dynamically allocated per_cpu_pageset structs for all possible CPUs:

1) Detach zone->pageset from the shared boot_pageset at the end of
   __build_all_zonelists().

2) Use a mutex to protect zone->pageset while it is still shared in
   online_pages().

Otherwise, multiple zones of different nodes would share the same
bootstrapping boot_pageset for the same CPU, which eventually causes the
kernel panic below:

------------[ cut here ]------------
kernel BUG at mm/page_alloc.c:1239!
invalid opcode: 0000 [#1] SMP
...
Call Trace:
 [] __alloc_pages_nodemask+0x131/0x7b0
 [] alloc_pages_current+0x87/0xd0
 [] __page_cache_alloc+0x67/0x70
 [] __do_page_cache_readahead+0x120/0x260
 [] ra_submit+0x21/0x30
 [] ondemand_readahead+0x166/0x2c0
 [] page_cache_async_readahead+0x80/0xa0
 [] generic_file_aio_read+0x364/0x670
 [] nfs_file_read+0xca/0x130
 [] do_sync_read+0xfa/0x140
 [] vfs_read+0xb5/0x1a0
 [] sys_read+0x51/0x80
 [] system_call_fastpath+0x16/0x1b
RIP  [] get_page_from_freelist+0x883/0x900
 RSP
---[ end trace 4bda28328b9990db ]

[akpm@linux-foundation.org: merge fix]
Signed-off-by: Haicheng Li
Signed-off-by: Wu Fengguang
Reviewed-by: Andi Kleen
Reviewed-by: Christoph Lameter
Cc: Mel Gorman
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h |  2 +-
 init/main.c            |  2 +-
 kernel/cpu.c           |  2 +-
 mm/memory_hotplug.c    | 18 +++++++++++++-----
 mm/page_alloc.c        | 17 +++++++++++++----
 5 files changed, 29 insertions(+), 12 deletions(-)

(limited to 'mm/memory_hotplug.c')
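The mm/page_alloc.c hunk below adds a forward declaration of
setup_zone_pageset() and calls it for the hot-added zone, but the function
body comes from a companion patch and is not part of this view.  A minimal
sketch of what it is assumed to do (setup_pageset() is declared in the hunk
below; alloc_percpu(), per_cpu_ptr() and zone_batchsize() are existing kernel
helpers):

/* Assumed shape of setup_zone_pageset() (body not shown in this series'
 * hunks): give the zone its own per-cpu pagesets so it stops sharing the
 * bootstrap boot_pageset. */
static void setup_zone_pageset(struct zone *zone)
{
        int cpu;

        zone->pageset = alloc_percpu(struct per_cpu_pageset);
        for_each_possible_cpu(cpu)
                setup_pageset(per_cpu_ptr(zone->pageset, cpu),
                              zone_batchsize(zone));
}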
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f6f2c505fa7e..a367ed5bb3fe 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -652,7 +652,7 @@ typedef struct pglist_data {
 
 void get_zone_counts(unsigned long *active, unsigned long *inactive,
                        unsigned long *free);
-void build_all_zonelists(void);
+void build_all_zonelists(void *data);
 void wakeup_kswapd(struct zone *zone, int order);
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                int classzone_idx, int alloc_flags);
diff --git a/init/main.c b/init/main.c
index 22881b5e95e3..3bdb152f412f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -567,7 +567,7 @@ asmlinkage void __init start_kernel(void)
        setup_per_cpu_areas();
        smp_prepare_boot_cpu();       /* arch-specific boot-cpu hooks */
 
-       build_all_zonelists();
+       build_all_zonelists(NULL);
        page_alloc_init();
 
        printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a3fbcc0a0abc..3e8b3ba27175 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -358,7 +358,7 @@ int __cpuinit cpu_up(unsigned int cpu)
        }
 
        if (pgdat->node_zonelists->_zonerefs->zone == NULL)
-               build_all_zonelists();
+               build_all_zonelists(NULL);
 #endif
 
        cpu_maps_update_begin();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 85eb4d342ac5..089cc97aed3c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -389,6 +389,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
        int nid;
        int ret;
        struct memory_notify arg;
+       /*
+        * mutex to protect zone->pageset when it's still shared
+        * in onlined_pages()
+        */
+       static DEFINE_MUTEX(zone_pageset_mutex);
 
        arg.start_pfn = pfn;
        arg.nr_pages = nr_pages;
@@ -415,12 +420,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
         * This means the page allocator ignores this zone.
         * So, zonelist must be updated after online.
         */
+       mutex_lock(&zone_pageset_mutex);
        if (!populated_zone(zone))
                need_zonelists_rebuild = 1;
 
        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);
        if (ret) {
+               mutex_unlock(&zone_pageset_mutex);
                printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
                        nr_pages, pfn);
                memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -429,8 +436,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 
        zone->present_pages += onlined_pages;
        zone->zone_pgdat->node_present_pages += onlined_pages;
+       if (need_zonelists_rebuild)
+               build_all_zonelists(zone);
+       else
+               zone_pcp_update(zone);
 
-       zone_pcp_update(zone);
+       mutex_unlock(&zone_pageset_mutex);
        setup_per_zone_wmarks();
        calculate_zone_inactive_ratio(zone);
        if (onlined_pages) {
@@ -438,10 +449,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
                node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
        }
 
-       if (need_zonelists_rebuild)
-               build_all_zonelists();
-       else
-               vm_total_pages = nr_free_pagecache_pages();
+       vm_total_pages = nr_free_pagecache_pages();
 
        writeback_set_ratelimit();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 595d0ac211e2..21c52d2d8624 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2572,7 +2572,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                                NUMA_ZONELIST_ORDER_LEN);
                        user_zonelist_order = oldval;
                } else if (oldval != user_zonelist_order)
-                       build_all_zonelists();
+                       build_all_zonelists(NULL);
        }
 out:
        mutex_unlock(&zl_order_mutex);
@@ -2922,9 +2922,10 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  */
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
 
 /* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
 {
        int nid;
        int cpu;
@@ -2939,6 +2940,14 @@ static int __build_all_zonelists(void *dummy)
                build_zonelist_cache(pgdat);
        }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+       /* Setup real pagesets for the new zone */
+       if (data) {
+               struct zone *zone = data;
+               setup_zone_pageset(zone);
+       }
+#endif
+
        /*
         * Initialize the boot_pagesets that are going to be used
         * for bootstrapping processors. The real pagesets for
@@ -2958,7 +2967,7 @@ static int __build_all_zonelists(void *dummy)
        return 0;
 }
 
-void build_all_zonelists(void)
+void build_all_zonelists(void *data)
 {
        set_zonelist_order();
@@ -2969,7 +2978,7 @@ void build_all_zonelists(void)
        } else {
                /* we have to stop all cpus to guarantee there is no user of
                   zonelist */
-               stop_machine(__build_all_zonelists, NULL, NULL);
+               stop_machine(__build_all_zonelists, data, NULL);
                /* cpuset refresh routine should be here */
        }
        vm_total_pages = nr_free_pagecache_pages();
-- cgit v1.2.3-71-gd317
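Taken together, the hunks above give build_all_zonelists() a simple argument
contract; summarized here as a usage sketch (not additional upstream code):

        /* Boot, onlining a CPU on a zonelist-less node, or a change of
         * numa_zonelist_order: rebuild the zonelists only; any zone that has
         * no real pagesets yet keeps using the shared boot_pageset. */
        build_all_zonelists(NULL);

        /* Memory hot-add that populates a brand-new zone: rebuild the
         * zonelists and also hand that zone real per-cpu pagesets via
         * setup_zone_pageset(zone). */
        build_all_zonelists(zone);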
From 4eaf3f64397c3db3c5785eee508270d62a9fabd9 Mon Sep 17 00:00:00 2001
From: Haicheng Li
Date: Mon, 24 May 2010 14:32:52 -0700
Subject: mem-hotplug: fix potential race while building zonelist for new populated zone

Add a global mutex, zonelists_mutex, to fix the possible race:

     CPU0                                CPU1                    CPU2
(1) zone->present_pages += online_pages;
(2)                                      build_all_zonelists();
(3)                                                              alloc_page();
(4)                                                              free_page();
(5) build_all_zonelists();
(6)   __build_all_zonelists();
(7)     zone->pageset = alloc_percpu();

In steps (3) and (4), zone->pageset still points to boot_pageset, so bad
things may happen if two or more nodes are in this state.  Even if only one
node is accessing the boot_pageset, (3) may still consume enough memory to
make the allocation in step (7) fail.

Besides, making the operation atomic ensures that alloc_percpu() in step (7)
will never fail, since there is a fresh new memory block added in step (6).

[haicheng.li@linux.intel.com: hold zonelists_mutex when build_all_zonelists]
Signed-off-by: Haicheng Li
Signed-off-by: Wu Fengguang
Reviewed-by: Andi Kleen
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h |  1 +
 kernel/cpu.c           |  5 ++++-
 mm/memory_hotplug.c    | 11 +++--------
 mm/page_alloc.c        | 15 ++++++++++++++-
 4 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'mm/memory_hotplug.c')
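The rule the diffs below implement, condensed into a usage sketch (this is
the pattern every post-boot caller is expected to follow, not an additional
upstream call site):

        /* Any zonelist rebuild after boot must hold zonelists_mutex, so that
         * concurrent memory/node onlining cannot interleave between
         * rebuilding the zonelists and replacing the shared boot_pageset. */
        mutex_lock(&zonelists_mutex);
        build_all_zonelists(NULL);      /* or build_all_zonelists(zone) */
        mutex_unlock(&zonelists_mutex);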
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a367ed5bb3fe..0fa491326c4a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -650,6 +650,7 @@ typedef struct pglist_data {
 #include
 
+extern struct mutex zonelists_mutex;
 void get_zone_counts(unsigned long *active, unsigned long *inactive,
                        unsigned long *free);
 void build_all_zonelists(void *data);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3e8b3ba27175..124ad9d6be16 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -357,8 +357,11 @@ int __cpuinit cpu_up(unsigned int cpu)
                return -ENOMEM;
        }
 
-       if (pgdat->node_zonelists->_zonerefs->zone == NULL)
+       if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+               mutex_lock(&zonelists_mutex);
                build_all_zonelists(NULL);
+               mutex_unlock(&zonelists_mutex);
+       }
 #endif
 
        cpu_maps_update_begin();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 089cc97aed3c..a4cfcdc00455 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -389,11 +389,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
        int nid;
        int ret;
        struct memory_notify arg;
-       /*
-        * mutex to protect zone->pageset when it's still shared
-        * in onlined_pages()
-        */
-       static DEFINE_MUTEX(zone_pageset_mutex);
 
        arg.start_pfn = pfn;
        arg.nr_pages = nr_pages;
@@ -420,14 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
         * This means the page allocator ignores this zone.
         * So, zonelist must be updated after online.
         */
-       mutex_lock(&zone_pageset_mutex);
+       mutex_lock(&zonelists_mutex);
        if (!populated_zone(zone))
                need_zonelists_rebuild = 1;
 
        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);
        if (ret) {
-               mutex_unlock(&zone_pageset_mutex);
+               mutex_unlock(&zonelists_mutex);
                printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
                        nr_pages, pfn);
                memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -441,7 +436,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
        else
                zone_pcp_update(zone);
 
-       mutex_unlock(&zone_pageset_mutex);
+       mutex_unlock(&zonelists_mutex);
        setup_per_zone_wmarks();
        calculate_zone_inactive_ratio(zone);
        if (onlined_pages) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 21c52d2d8624..08b349931ebc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2571,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                        strncpy((char*)table->data, saved_string,
                                NUMA_ZONELIST_ORDER_LEN);
                        user_zonelist_order = oldval;
-               } else if (oldval != user_zonelist_order)
+               } else if (oldval != user_zonelist_order) {
+                       mutex_lock(&zonelists_mutex);
                        build_all_zonelists(NULL);
+                       mutex_unlock(&zonelists_mutex);
+               }
        }
 out:
        mutex_unlock(&zl_order_mutex);
@@ -2924,6 +2927,12 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static void setup_zone_pageset(struct zone *zone);
 
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
 /* return values int ....just for stop_machine() */
 static __init_refok int __build_all_zonelists(void *data)
 {
@@ -2967,6 +2976,10 @@ static __init_refok int __build_all_zonelists(void *data)
        return 0;
 }
 
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
 void build_all_zonelists(void *data)
 {
        set_zonelist_order();
-- cgit v1.2.3-71-gd317
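For orientation, the combined effect of the three patches on online_pages()
can be condensed as follows (a recap sketch assembled from the hunks above,
with error handling and notifier calls trimmed; zone_pcp_update()'s body is
outside this view and is assumed to resize the zone's per-cpu page lists):

        mutex_lock(&zonelists_mutex);
        if (!populated_zone(zone))
                need_zonelists_rebuild = 1;

        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);
        /* ... error handling trimmed ... */

        zone->present_pages += onlined_pages;
        zone->zone_pgdat->node_present_pages += onlined_pages;
        if (need_zonelists_rebuild)
                build_all_zonelists(zone);   /* new zone: also gets real pagesets */
        else
                zone_pcp_update(zone);       /* grown zone: resize its pcp lists */

        mutex_unlock(&zonelists_mutex);
        setup_per_zone_wmarks();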