From 84c89557a302e18414a011cc52b1abd034860743 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Thu, 13 Jan 2011 19:59:47 +0000 Subject: dm ioctl: allow rename to fill empty uuid Allow the uuid of a mapped device to be set after device creation. Previously the uuid (which is optional) could only be set by DM_DEV_CREATE. If no uuid was supplied it could not be set later. Sometimes it's necessary to create the device before the uuid is known, and in such cases the uuid must be filled in after the creation. This patch extends DM_DEV_RENAME to accept a uuid accompanied by a new flag DM_UUID_FLAG. This can only be done once and if no uuid was previously supplied. It cannot be used to change an existing uuid. DM_VERSION_MINOR is also bumped to 19 to indicate this interface extension is available. Signed-off-by: Peter Jones Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- include/linux/dm-ioctl.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 49eab360d5d4..e453e1174ae1 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -44,7 +44,7 @@ * Remove a device, destroy any tables. * * DM_DEV_RENAME: - * Rename a device. + * Rename a device or set its uuid if none was previously supplied. * * DM_SUSPEND: * This performs both suspend and resume, depending which flag is @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 18 +#define DM_VERSION_MINOR 19 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" +#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -322,4 +322,10 @@ enum { */ #define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ -- cgit v1.2.3-71-gd317 From 86a54a4802df10d23ccd655e2083e812fe990243 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:52 +0000 Subject: dm log userspace: add version number to comms This patch adds a 'version' field to the 'dm_ulog_request' structure. The 'version' field is taken from a portion of the unused 'padding' field in the 'dm_ulog_request' structure. This was done to avoid changing the size of the structure and possibly disrupting backwards compatibility. The version number will help notify user-space daemons when a change has been made to the kernel/userspace log API. Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 6 ++++-- drivers/md/dm-log-userspace-transfer.c | 1 + include/linux/dm-log-userspace.h | 13 ++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 31e1687e7bf6..aa2e0c374ab3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -12,6 +12,8 @@ #include "dm-log-userspace-transfer.h" +#define DM_LOG_USERSPACE_VSN "1.1.0" + struct flush_entry { int type; region_t region; @@ -765,7 +767,7 @@ static int __init userspace_dirty_log_init(void) return r; } - DMINFO("version 1.0.0 loaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); return 0; } @@ -775,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) dm_ulog_tfr_exit(); mempool_destroy(flush_entry_pool); - DMINFO("version 1.0.0 unloaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf8a9f5..049eaf12aaab 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -198,6 +198,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 0c3c3a2110c4..eeace7d3ff15 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -370,6 +370,16 @@ #define DM_ULOG_REQUEST_TYPE(request_type) \ (DM_ULOG_REQUEST_MASK & (request_type)) +/* + * DM_ULOG_REQUEST_VERSION is incremented when there is a + * change to the way information is passed between kernel + * and userspace. This could be a structure change of + * dm_ulog_request or a change in the way requests are + * issued/handled. Changes are outlined here: + * version 1: Initial implementation + */ +#define DM_ULOG_REQUEST_VERSION 1 + struct dm_ulog_request { /* * The local unique identifier (luid) and the universally unique @@ -383,8 +393,9 @@ struct dm_ulog_request { */ uint64_t luid; char uuid[DM_UUID_LEN]; - char padding[7]; /* Padding because DM_UUID_LEN = 129 */ + char padding[3]; /* Padding because DM_UUID_LEN = 129 */ + uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ int32_t error; /* Used to report back processing errors */ uint32_t seq; /* Sequence number for request */ -- cgit v1.2.3-71-gd317 From 9c4376de98719d2768dd919553843de34bb094a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:58 +0000 Subject: dm: use non reentrant workqueues if equivalent kmirrord_wq, kcopyd_work and md->wq are created per dm instance and serve only a single work item from the dm instance, so non-reentrant workqueues would provide the same ordering guarantees as ordered ones while allowing CPU affinity and use of the workqueues for other purposes. Switch them to non-reentrant workqueues. Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 3 ++- drivers/md/dm-raid1.c | 5 +++-- drivers/md/dm.c | 3 ++- include/linux/dm-ioctl.h | 4 ++-- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 63d67169c7f4..924f5f0084c2 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -672,7 +672,8 @@ int dm_kcopyd_client_create(unsigned int nr_pages, goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = alloc_ordered_workqueue("kcopyd", WQ_MEM_RECLAIM); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 39917430f9f8..dee326775c60 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1085,7 +1085,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_requests = 1; ti->num_discard_requests = 1; - ms->kmirrord_wq = alloc_ordered_workqueue("kmirrord", WQ_MEM_RECLAIM); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1414,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 12, 0}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 39aaa9290128..e504bb40d60e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1883,7 +1883,8 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = alloc_ordered_workqueue("kdmflush", WQ_MEM_RECLAIM); + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index e453e1174ae1..78bbf47bbb96 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -268,8 +268,8 @@ enum { #define DM_VERSION_MAJOR 4 #define DM_VERSION_MINOR 19 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3-71-gd317 From 9d357b0787bb3c91835d5e658c3bda178f9ca419 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:01 +0000 Subject: dm: introduce target callbacks and congestion callback DM currently implements congestion checking by checking on congestion in each component device. For raid456 we need to also check if the stripe cache is congested. Add per-target congestion checker callback support. Extending the target_callbacks structure with additional callback functions allows for establishing multiple callbacks per-target (a callback is also needed for unplug). Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 14 ++++++++++++++ include/linux/device-mapper.h | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 985c20a4f30e..7e2ec3c05550 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -71,6 +71,8 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); t->discards_supported = 1; @@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { @@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2970022faa63..4b1c63d478ab 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -193,6 +193,12 @@ struct dm_target { char *error; }; +/* Each target can link one of these into the table */ +struct dm_target_callbacks { + struct list_head list; + int (*congested_fn) (struct dm_target_callbacks *, int); +}; + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); @@ -268,6 +274,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params); +/* + * Target_ctr should call this if it needs to add any callbacks. + */ +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb); + /* * Finally call this to make the table ready for use. */ -- cgit v1.2.3-71-gd317 From 99d03c141b40914b67d63c9d23b8da4386422ed7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:02 +0000 Subject: dm: per target unplug callback support Add per-target unplug callback support. Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 5 +++++ include/linux/device-mapper.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7e2ec3c05550..dffa0ac7c4f0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1278,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); @@ -1290,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t) dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->unplug_fn) + cb->unplug_fn(cb); } struct mapped_device *dm_table_get_md(struct dm_table *t) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4b1c63d478ab..272496d1fae4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -197,6 +197,7 @@ struct dm_target { struct dm_target_callbacks { struct list_head list; int (*congested_fn) (struct dm_target_callbacks *, int); + void (*unplug_fn)(struct dm_target_callbacks *); }; int dm_register_target(struct target_type *t); -- cgit v1.2.3-71-gd317 From d8a3515e2a9523f8ed56d1f4537d16338bda2bc2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 13 Jan 2011 17:26:46 -0800 Subject: Revert "gpiolib: annotate gpio-intialization with __must_check" This reverts commit 0fdae42d361bbb431ca0ab0efed5126a94821177, which wasn't really supposed to go in, and causes lots of annoying warnings. Quoth Andrew: "Complete brainfart - I meant to drop that patch ages ago." Quoth Greg: "Ick, yeah, that patch isn't ok to go in as-is, all of the callers need to be fixed up first, which is what I thought we had agreed on..." Reported-by: Stephen Rothwell Acked-by: Andrew Morton Acked-by: Greg KH Signed-off-by: Linus Torvalds --- Documentation/gpio.txt | 2 +- include/asm-generic/gpio.h | 10 +++++----- include/linux/gpio.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt index a492d92bb098..792faa3c06cf 100644 --- a/Documentation/gpio.txt +++ b/Documentation/gpio.txt @@ -135,7 +135,7 @@ setting up a platform_device using the GPIO, is mark its direction: int gpio_direction_input(unsigned gpio); int gpio_direction_output(unsigned gpio, int value); -The return value is zero for success, else a negative errno. It must +The return value is zero for success, else a negative errno. It should be checked, since the get/set calls don't have error returns and since misconfiguration is possible. You should normally issue these calls from a task context. However, for spinlock-safe GPIOs it's OK to use them diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h index 6098cae2af8e..ff5c66080c8c 100644 --- a/include/asm-generic/gpio.h +++ b/include/asm-generic/gpio.h @@ -147,11 +147,11 @@ extern struct gpio_chip *gpiochip_find(void *data, /* Always use the library code for GPIO management calls, * or when sleeping may be involved. */ -extern int __must_check gpio_request(unsigned gpio, const char *label); +extern int gpio_request(unsigned gpio, const char *label); extern void gpio_free(unsigned gpio); -extern int __must_check gpio_direction_input(unsigned gpio); -extern int __must_check gpio_direction_output(unsigned gpio, int value); +extern int gpio_direction_input(unsigned gpio); +extern int gpio_direction_output(unsigned gpio, int value); extern int gpio_set_debounce(unsigned gpio, unsigned debounce); @@ -192,8 +192,8 @@ struct gpio { const char *label; }; -extern int __must_check gpio_request_one(unsigned gpio, unsigned long flags, const char *label); -extern int __must_check gpio_request_array(struct gpio *array, size_t num); +extern int gpio_request_one(unsigned gpio, unsigned long flags, const char *label); +extern int gpio_request_array(struct gpio *array, size_t num); extern void gpio_free_array(struct gpio *array, size_t num); #ifdef CONFIG_GPIO_SYSFS diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f79d67f413e4..4b47ed96f131 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -30,7 +30,7 @@ static inline int gpio_is_valid(int number) return 0; } -static inline int __must_check gpio_request(unsigned gpio, const char *label) +static inline int gpio_request(unsigned gpio, const char *label) { return -ENOSYS; } @@ -62,12 +62,12 @@ static inline void gpio_free_array(struct gpio *array, size_t num) WARN_ON(1); } -static inline int __must_check gpio_direction_input(unsigned gpio) +static inline int gpio_direction_input(unsigned gpio) { return -ENOSYS; } -static inline int __must_check gpio_direction_output(unsigned gpio, int value) +static inline int gpio_direction_output(unsigned gpio, int value) { return -ENOSYS; } -- cgit v1.2.3-71-gd317 From 6c9ae009b298753a3baf71298d676a68b5a10c8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Jan 2011 15:45:38 -0800 Subject: irq: use per_cpu kstat_irqs Use modern per_cpu API to increment {soft|hard}irq counters, and use per_cpu allocation for (struct irq_desc)->kstats_irq instead of an array. This gives better SMP/NUMA locality and saves few instructions per irq. With small nr_cpuids values (8 for example), kstats_irq was a small array (less than L1_CACHE_BYTES), potentially source of false sharing. In the !CONFIG_SPARSE_IRQ case, remove the huge, NUMA/cache unfriendly kstat_irqs_all[NR_IRQS][NR_CPUS] array. Note: we still populate kstats_irq for all possible irqs in early_irq_init(). We probably could use on-demand allocations. (Code included in alloc_descs()). Problem is not all IRQS are used with a prior alloc_descs() call. kstat_irqs_this_cpu() is not used anymore, remove it. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Andi Kleen Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irqdesc.h | 2 +- include/linux/kernel_stat.h | 19 +++++++++---------- kernel/irq/irqdesc.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 979c68cc7458..6a64c6fa81af 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -57,7 +57,7 @@ struct irq_desc { #endif struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44e83ba12b5b..0cce2db580c3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); #ifndef CONFIG_GENERIC_HARDIRQS -#define kstat_irqs_this_cpu(irq) \ - (this_cpu_read(kstat.irqs[irq]) struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { - kstat_this_cpu.irqs[irq]++; - kstat_this_cpu.irqs_sum++; + __this_cpu_inc(kstat.irqs[irq]); + __this_cpu_inc(kstat.irqs_sum); } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) #else #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ - ((DESC)->kstat_irqs[smp_processor_id()]++);\ - kstat_this_cpu.irqs_sum++; } while (0) + +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +do { \ + __this_cpu_inc(*(DESC)->kstat_irqs); \ + __this_cpu_inc(kstat.irqs_sum); \ +} while (0) #endif static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { - kstat_this_cpu.softirqs[irq]++; + __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f5..282f20230e67 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... */ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.2.3-71-gd317 From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 13 Jan 2011 15:45:40 -0800 Subject: sched: remove long deprecated CLONE_STOPPED flag This warning was added in commit bdff746a3915 ("clone: prepare to recycle CLONE_STOPPED") three years ago. 2.6.26 came and went. As far as I know, no-one is actually using CLONE_STOPPED. Signed-off-by: Dave Jones Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- kernel/fork.c | 28 +--------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 96e23215e276..07402530fc70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,7 +21,8 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f20b6b0..76a1fdd80bdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,23 +1409,6 @@ long do_fork(unsigned long clone_flags, return -EPERM; } - /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - /* * When called from kernel_thread, don't do user tracing stuff. */ @@ -1464,16 +1447,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); -- cgit v1.2.3-71-gd317 From 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:41 -0800 Subject: mm: page allocator: adjust the per-cpu counter threshold when memory is low Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when a threshold is above a threshold. On large CPU systems, the difference between the estimate and real value of NR_FREE_PAGES can be very high. The system can get into a case where pages are allocated far below the min watermark potentially causing livelock issues. The commit solved the problem by taking a better reading of NR_FREE_PAGES when memory was low. Unfortately, as reported by Shaohua Li this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing. This patch takes a different approach. For large machines where counter drift might be unsafe and while kswapd is awake, the per-cpu thresholds for the target pgdat are reduced to limit the level of drift to what should be a safe level. This incurs a performance penalty in heavy memory pressure by a factor that depends on the workload and the machine but the machine should function correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps but the event is not expected to be frequent - in Shaohua's test case, there was one recorded sleep and wake event at least. To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES when called from wakeup_kswapd, when deciding whether it is really safe to go back to sleep in sleeping_prematurely() and when deciding if a zone is really balanced or not in balance_pgdat(). We are still using an expensive function but limiting how often it is called. When the test case is reproduced, the time spent in the watermark functions is reduced. The following report is on the percentage of time spent cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(), zone_page_state(). vanilla 11.6615% disable-threshold 0.2584% David said: : We had to pull aa454840 "mm: page allocator: calculate a better estimate : of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36 : internally because tests showed that it would cause the machine to stall : as the result of heavy kswapd activity. I merged it back with this fix as : it is pending in the -mm tree and it solves the issue we were seeing, so I : definitely think this should be pushed to -stable (and I would seriously : consider it for 2.6.37 inclusion even at this late date). Signed-off-by: Mel Gorman Reported-by: Shaohua Li Reviewed-by: Christoph Lameter Tested-by: Nicolas Bareil Cc: David Rientjes Cc: Kyle McMartin Cc: [2.6.37.1, 2.6.36.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 +++----- include/linux/vmstat.h | 5 ++++ mm/mmzone.c | 21 ---------------- mm/page_alloc.c | 35 ++++++++++++++++++++------ mm/vmscan.c | 23 +++++++++-------- mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 115 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39c24ebe9cfd..48906629335c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -#ifdef CONFIG_SMP -unsigned long zone_nr_free_pages(struct zone *zone); -#else -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) -#endif /* CONFIG_SMP */ - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -661,7 +655,9 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37b3b75..e4cc21cf5870 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); +void restore_pgdat_percpu_threshold(pg_data_t *pgdat); #else /* CONFIG_SMP */ /* @@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } +static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 826ba6922e84..22a1bb7723e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -2442,7 +2461,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c69274..5da4295e7d67 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (zone->all_unreclaimable) continue; - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) return 1; } @@ -2230,7 +2230,7 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; break; @@ -2276,7 +2276,7 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2297,7 +2297,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2305,7 +2305,7 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; } else { @@ -2448,7 +2448,9 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + restore_pgdat_percpu_threshold(pgdat); schedule(); + reduce_pgdat_percpu_threshold(pgdat); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); @@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order) if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - return; if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 312d728976f1..bc0f095791b4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP +static int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + static int calculate_threshold(struct zone *zone) { int threshold; @@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void) } } +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_pressure_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + +void restore_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + /* * For use when we know that interrupts are disabled. */ @@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), -- cgit v1.2.3-71-gd317 From b44129b30652c8771db2265939bb8b463724043d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:43 -0800 Subject: mm: vmstat: use a single setter function and callback for adjusting percpu thresholds reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid errors due to counter drift. The functions duplicate some code so this patch replaces them with a single set_pgdat_percpu_threshold() that takes a callback function to calculate the desired threshold as a parameter. [akpm@linux-foundation.org: readability tweak] [kosaki.motohiro@jp.fujitsu.com: set_pgdat_percpu_threshold(): don't use for_each_online_cpu] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 ++++++---- mm/vmscan.c | 19 +++++++++++++++++-- mm/vmstat.c | 36 +++++++----------------------------- 3 files changed, 30 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4cc21cf5870..833e676d6d92 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); -void restore_pgdat_percpu_threshold(pg_data_t *pgdat); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state -static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } -static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } +#define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 5da4295e7d67..86f8c3418795 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2448,9 +2448,24 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - restore_pgdat_percpu_threshold(pgdat); + + /* + * vmstat counters are not perfectly + * accurate and the estimated value + * for counters such as NR_FREE_PAGES + * can deviate from the true value by + * nr_online_cpus * threshold. To + * avoid the zone watermarks being + * breached while under pressure, we + * reduce the per-cpu vmstat threshold + * while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, + calculate_normal_threshold); schedule(); - reduce_pgdat_percpu_threshold(pgdat); + set_pgdat_percpu_threshold(pgdat, + calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); diff --git a/mm/vmstat.c b/mm/vmstat.c index bc0f095791b4..751a65e00aac 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_pressure_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) { int threshold; int watermark_distance; @@ -107,7 +107,7 @@ static int calculate_pressure_threshold(struct zone *zone) return threshold; } -static int calculate_threshold(struct zone *zone) +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -166,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -185,46 +185,24 @@ static void refresh_zone_stat_thresholds(void) } } -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) { struct zone *zone; int cpu; int threshold; int i; - get_online_cpus(); - for (i = 0; i < pgdat->nr_zones; i++) { - zone = &pgdat->node_zones[i]; - if (!zone->percpu_drift_mark) - continue; - - threshold = calculate_pressure_threshold(zone); - for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->stat_threshold - = threshold; - } - put_online_cpus(); -} - -void restore_pgdat_percpu_threshold(pg_data_t *pgdat) -{ - struct zone *zone; - int cpu; - int threshold; - int i; - - get_online_cpus(); for (i = 0; i < pgdat->nr_zones; i++) { zone = &pgdat->node_zones[i]; if (!zone->percpu_drift_mark) continue; - threshold = calculate_threshold(zone); - for_each_online_cpu(cpu) + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } - put_online_cpus(); } /* -- cgit v1.2.3-71-gd317 From 3e7d344970673c5334cf7b5bb27c8c0942b06126 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:56 -0800 Subject: mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim Lumpy reclaim is disruptive. It reclaims a large number of pages and ignores the age of the pages it reclaims. This can incur significant stalls and potentially increase the number of major faults. Compaction has reached the point where it is considered reasonably stable (meaning it has passed a lot of testing) and is a potential candidate for displacing lumpy reclaim. This patch introduces an alternative to lumpy reclaim whe compaction is available called reclaim/compaction. The basic operation is very simple - instead of selecting a contiguous range of pages to reclaim, a number of order-0 pages are reclaimed and then compaction is later by either kswapd (compact_zone_order()) or direct compaction (__alloc_pages_direct_compact()). [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: use conventional task_struct naming] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 14 +++++++ include/linux/kernel.h | 7 ++++ mm/compaction.c | 89 ++++++++++++++++++++++++--------------- mm/migrate.c | 17 ++++++++ mm/page_alloc.c | 16 +++++++ mm/vmscan.c | 102 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 196 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac51552d908..2592883d862d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask) +{ + return 0; +} + static inline void defer_compaction(struct zone *zone) { } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac7022b63..5a9d9059520b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/mm/compaction.c b/mm/compaction.c index 20011a850fef..8fe917ec7c11 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. + * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed dependingon watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, +unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask) { struct compact_control cc = { @@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - status = compact_zone_order(zone, order, gfp_mask); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66a7045..94875b265928 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22a1bb7723e4..03a66a31bfcd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int migratetype, unsigned long *did_some_progress) { struct page *page; + struct task_struct *tsk = current; if (!order || compaction_deferred(preferred_zone)) return NULL; + tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask); + tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -2121,6 +2124,19 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; } nopage: diff --git a/mm/vmscan.c b/mm/vmscan.c index 3464312bde07..10ebd74a423c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -59,12 +60,15 @@ * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference * page from the LRU and reclaim all pages within a * naturally aligned range + * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone */ typedef unsigned __bitwise__ lumpy_mode; #define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) #define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) #define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) #define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) +#define LUMPY_MODE_COMPACTION ((__force lumpy_mode)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) - return; + if (COMPACTION_BUILD) + sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ - sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; if (sc->order > PAGE_ALLOC_COSTLY_ORDER) sc->lumpy_reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) @@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1814,6 +1820,57 @@ out: } } +/* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION)) + return false; + + /* + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation. If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + unsigned long nr_scanned = sc->nr_scanned; +restart: + nr_reclaimed = 0; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -2307,6 +2371,14 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + /* + * Compact the zone for higher orders to reduce + * latencies for higher-order allocations that + * would ordinarily call try_to_compact_pages() + */ + if (sc.order > PAGE_ALLOC_COSTLY_ORDER) + compact_zone_order(zone, sc.order, sc.gfp_mask); + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit v1.2.3-71-gd317 From 77f1fe6b08b13a87391549c8a820ddc817b6f50e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:57 -0800 Subject: mm: migration: allow migration to operate asynchronously and avoid synchronous compaction in the faster path Migration synchronously waits for writeback if the initial passes fails. Callers of memory compaction do not necessarily want this behaviour if the caller is latency sensitive or expects that synchronous migration is not going to have a significantly better success rate. This patch adds a sync parameter to migrate_pages() allowing the caller to indicate if wait_on_page_writeback() is allowed within migration or not. For reclaim/compaction, try_to_compact_pages() is first called asynchronously, direct reclaim runs and then try_to_compact_pages() is called synchronously as there is a greater expectation that it'll succeed. [akpm@linux-foundation.org: build/merge fix] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 ++++++---- include/linux/migrate.h | 12 ++++++++---- mm/compaction.c | 14 ++++++++++---- mm/memory-failure.c | 8 +++++--- mm/memory_hotplug.c | 3 ++- mm/mempolicy.c | 4 ++-- mm/migrate.c | 22 +++++++++++++--------- mm/page_alloc.c | 21 +++++++++++++++------ mm/vmscan.c | 3 ++- 9 files changed, 63 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 2592883d862d..72cba4034785 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,10 +21,11 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask); + int order, gfp_t gfp_mask, nodemask_t *mask, + bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, bool sync); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -57,7 +58,8 @@ static inline bool compaction_deferred(struct zone *zone) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { return COMPACT_CONTINUE; } @@ -68,7 +70,7 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, bool sync) { return 0; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 085527fb8261..fa31902803fd 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 8fe917ec7c11..47fca1069343 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -33,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -455,7 +456,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + (unsigned long)cc, 0, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -482,7 +484,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) + int order, gfp_t gfp_mask, + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -490,6 +493,7 @@ unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -505,11 +509,13 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -533,7 +539,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c044b0e..2323a8039a98 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1290,9 +1290,10 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); if (ret) { - putback_lru_pages(&pagelist); + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1413,7 +1414,8 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af5473..584fc5588fdd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -733,7 +733,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + 1, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260fb282..9db274593086 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0); + err = migrate_pages(&pagelist, new_node_page, dest, 0, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1155,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, 0, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 94875b265928..dc47f6c40353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -682,7 +682,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force) + if (!force || !sync) goto uncharge; wait_on_page_writeback(page); } @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining) + int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -841,7 +841,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force) + if (!force || !sync) goto out; lock_page(hpage); } @@ -909,7 +909,8 @@ out: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -929,7 +930,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -958,7 +960,8 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -974,7 +977,8 @@ int migrate_huge_pages(struct list_head *from, cond_resched(); rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining); + private, page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -1107,7 +1111,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a66a31bfcd..0fd486467b4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1812,7 +1812,8 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; struct task_struct *tsk = current; @@ -1822,7 +1823,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -1859,7 +1860,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -2001,6 +2003,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2063,14 +2066,19 @@ rebalance: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2134,7 +2142,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 10ebd74a423c..8320d115c85d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2377,7 +2377,8 @@ loop_again: * would ordinarily call try_to_compact_pages() */ if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask); + compact_zone_order(zone, sc.order, sc.gfp_mask, + false); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { -- cgit v1.2.3-71-gd317 From 7f0f24967b0349798803260b2e4bf347cffa1990 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:58 -0800 Subject: mm: migration: cleanup migrate_pages API by matching types for offlining and sync With the introduction of the boolean sync parameter, the API looks a little inconsistent as offlining is still an int. Convert offlining to a bool for the sake of being tidy. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 8 ++++---- mm/compaction.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 6 ++++-- mm/migrate.c | 8 ++++---- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa31902803fd..e39aeecfe9a2 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,10 +13,10 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int fail_migrate_page(struct address_space *, @@ -35,10 +35,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 47fca1069343..e005a30e968c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -456,7 +456,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0, + (unsigned long)cc, false, cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 584fc5588fdd..a2832c092509 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -734,7 +734,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - 1, true); + true, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9db274593086..7ee55af8d79c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0, true); + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1156,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0, true); + (unsigned long)vma, + false, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index dc47f6c40353..690d0de993af 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining, bool sync) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining, bool sync) + int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -909,7 +909,7 @@ out: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; @@ -960,7 +960,7 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; -- cgit v1.2.3-71-gd317 From e5a5623b28198aa91ea71ee5d3846757fc76bc87 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: remove unused get_vm_area_node get_vm_area_node() is unused in the kernel and can thus be removed. Signed-off-by: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- mm/vmalloc.c | 7 ------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 44b54f619ac6..cb73c755fac8 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -90,9 +90,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node, - gfp_t gfp_mask); extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 48245af07b10..78ec9d8bc57c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; -- cgit v1.2.3-71-gd317 From ec3f64fc9c196a304c4b7db3e1ff56d640628509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:01 -0800 Subject: mm: remove gfp mask from pcpu_get_vm_areas pcpu_get_vm_areas() only uses GFP_KERNEL allocations, so remove the gfp_t formal and use the mask internally. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 21 +++++++++------------ 3 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index cb73c755fac8..c7348b8d0a81 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -117,7 +117,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #ifdef CONFIG_SMP struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask); + size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); #endif diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3f..ea534960a04b 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78ec9d8bc57c..f67546636322 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2196,17 +2196,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. It * does everything top-down and scans areas from the end looking for @@ -2217,7 +2216,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2227,8 +2226,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2261,14 +2258,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } -- cgit v1.2.3-71-gd317 From d0a21265dfb5fa8ae54e90d0fb6d1c215b10a28a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:02 -0800 Subject: mm: unify module_alloc code for vmalloc Four architectures (arm, mips, sparc, x86) use __vmalloc_area() for module_init(). Much of the code is duplicated and can be generalized in a globally accessible function, __vmalloc_node_range(). __vmalloc_node() now calls into __vmalloc_node_range() with a range of [VMALLOC_START, VMALLOC_END) for functionally equivalent behavior. Each architecture may then use __vmalloc_node_range() directly to remove the duplication of code. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Russell King Cc: Ralf Baechle Cc: "David S. Miller" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 14 +++---------- arch/mips/kernel/module.c | 14 +++---------- arch/sparc/kernel/module.c | 14 ++++--------- arch/x86/kernel/module.c | 17 ++++------------ include/linux/vmalloc.h | 5 +++-- mm/vmalloc.c | 50 +++++++++++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 0c1bb68ff4a8..2cfe8161b478 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -38,17 +38,9 @@ #ifdef CONFIG_MMU void *module_alloc(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + __builtin_return_address(0)); } #else /* CONFIG_MMU */ void *module_alloc(unsigned long size) diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 6f51dda87fce..d87a72e9fac7 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { #ifdef MODULE_START - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); #else if (size == 0) return NULL; diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index ee3c7dde8d9f..8d348c474a2f 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -23,17 +23,11 @@ static void *module_map(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); } static char *dot2underscore(char *name) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f2956091735..ab23f1ad4bf1 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -37,20 +37,11 @@ void *module_alloc(unsigned long size) { - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } /* Free memory returned from module_alloc */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c7348b8d0a81..4ed6fcd6b726 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); +extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller); extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f67546636322..284346ee0e91 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1530,25 +1530,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. - */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1558,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1570,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1588,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. + */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, -- cgit v1.2.3-71-gd317 From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to or its inherited value at fork. Assuming the thread that has forked it has oom_score_adj of 0, each process could decrease it back from 0 upon activation unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternative considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't wan't all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch updated from original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/base.c | 4 +++- include/linux/sched.h | 2 ++ kernel/fork.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ef757fca470b..23cae6548d3a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1323,6 +1323,10 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +The value of /proc//oom_score_adj may be reduced no lower than the last +value set by a CAP_SYS_RESOURCE process. To reduce the value any lower +requires CAP_SYS_RESOURCE. + NOTICE: /proc//oom_adj is deprecated and will be removed, please see Documentation/feature-removal-schedule.txt. diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f1cdd5d3d7..9d096e82b201 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/linux/sched.h b/include/linux/sched.h index 07402530fc70..f23b5bb6f52e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -634,6 +634,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. + * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/kernel/fork.c b/kernel/fork.c index 76a1fdd80bdf..1499607e4da2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); -- cgit v1.2.3-71-gd317 From 212260aa07135b327752dc02625c68cf4ce04caf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:46:06 -0800 Subject: mm: clear PageError bit in msync & fsync Temporary IO failures, eg. due to loss of both multipath paths, can permanently leave the PageError bit set on a page, resulting in msync or fsync returning -EIO over and over again, even if IO is now getting to the disk correctly. We already clear the AS_ENOSPC and AS_IO bits in mapping->flags in the filemap_fdatawait_range function. Also clearing the PageError bit on the page allows subsequent msync or fsync calls on this file to return without an error, if the subsequent IO succeeds. Unfortunately data written out in the msync or fsync call that returned -EIO can still get lost, because the page dirty bit appears to not get restored on IO error. However, the alternative could be potentially all of memory filling up with uncleanable dirty pages, hanging the system, so there is no nice choice here... Signed-off-by: Rik van Riel Acked-by: Valerie Aurora Acked-by: Jeff Layton Cc: Theodore Ts'o Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- mm/filemap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5f38c460367e..4805fdec8d6e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,7 +198,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) -PAGEFLAG(Error, error) +PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) diff --git a/mm/filemap.c b/mm/filemap.c index 1a3dd5914726..b4ad8e36c81a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); -- cgit v1.2.3-71-gd317 From 110d74a921f4d272b47ef6104fcf937df808f4c8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:11 -0800 Subject: mm: add FOLL_MLOCK follow_page flag. Move the code to mlock pages from __mlock_vma_pages_range() to follow_page(). This allows __mlock_vma_pages_range() to not have to break down work into 16-page batches. An additional motivation for doing this within the present patch series is that it'll make it easier for a later chagne to drop mmap_sem when blocking on disk (we'd like to be able to resume at the page that was read from disk instead of at the start of a 16-page batch). Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 22 ++++++++++++++++++ mm/mlock.c | 65 +++++------------------------------------------------- 3 files changed, 28 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 721f451c3029..9ade803bcc1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1415,6 +1415,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_MLOCK 0x40 /* mark page as mlocked */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index b8f97b8575b7..15e1f19a3b10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1310,6 +1310,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if (flags & FOLL_MLOCK) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: diff --git a/mm/mlock.c b/mm/mlock.c index 67b3dd8616dc..25cc9e88c540 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -159,10 +159,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; + int ret; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -170,7 +169,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -185,63 +184,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, nr_pages--; } - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. However, page may - * still be truncated out from under us. - */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. - */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } - - return ret; /* 0 or negative error code */ + ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL); + return max(ret, 0); /* 0 or negative error code */ } /* -- cgit v1.2.3-71-gd317 From 088e54658f559a13c2b988086512d76fe9e8f846 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:16 -0800 Subject: mm: remove likely() from mapping_unevictable() The mapping_unevictable() has a likely() around the mapping parameter. This mapping parameter comes from page_mapping() which has an unlikely() that the page will be set as PAGE_MAPPING_ANON, and if so, it will return NULL. One would think that this unlikely() means that the mapping returned by page_mapping() would not be NULL, but where page_mapping() is used just above mapping_unevictable(), that unlikely() is incorrect most of the time. This means that the "likely(mapping)" in mapping_unevictable() is incorrect most of the time. Running the annotated branch profiler on my main box which runs firefox, evolution, xchat and is part of my distcc farm, I had this: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 12872836 1269443893 98 mapping_unevictable pagemap.h 51 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The mapping_unevictable() is also a static inline but seems to be used only once in my setup. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. Perhaps this is enough to remove the unlikely from page_mapping() as well. Signed-off-by: Steven Rostedt Reviewed-by: KOSAKI Motohiro Acked-by: Nick Piggin Acked-by: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2d1ffe3cf1ee..9c66e994540f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -48,7 +48,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) static inline int mapping_unevictable(struct address_space *mapping) { - if (likely(mapping)) + if (mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -- cgit v1.2.3-71-gd317 From e20e87795834f2f14cb53baf657b91d9c39f92c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:17 -0800 Subject: mm: remove unlikely() from page_mapping() page_mapping() has a unlikely that the mapping has PAGE_MAPPING_ANON set. But running the annotated branch profiler on a normal desktop system doing vairous tasks (xchat, evolution, firefox, distcc), it is not really that unlikely that the mapping here will have the PAGE_MAPPING_ANON flag set: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 35935762 1270265395 97 page_mapping mm.h 659 1306198001 143659 0 page_mapping mm.h 657 203131478 121586 0 page_mapping mm.h 657 5415491 1116 0 page_mapping mm.h 657 74899487 1116 0 page_mapping mm.h 657 203132845 224 0 page_mapping mm.h 659 5415464 27 0 page_mapping mm.h 659 13552 0 0 page_mapping mm.h 657 13552 0 0 page_mapping mm.h 659 242630 0 0 page_mapping mm.h 657 242630 0 0 page_mapping mm.h 659 74899487 0 0 page_mapping mm.h 659 The page_mapping() is a static inline, which is why it shows up multiple times. The unlikely in page_mapping() was correct a total of 1909540379 times and incorrect 1270533123 times, with a 39% being incorrect. With this much of an error, it's best to simply remove the unlikely and have the compiler and branch prediction figure this out. Signed-off-by: Steven Rostedt Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9ade803bcc1b..57e11b2a18ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -657,7 +657,7 @@ static inline struct address_space *page_mapping(struct page *page) VM_BUG_ON(PageSlab(page)); if (unlikely(PageSwapCache(page))) mapping = &swapper_space; - else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) + else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } -- cgit v1.2.3-71-gd317 From 9950474883e027e6e728cbcff25f7f2bf0c96530 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:20 -0800 Subject: mm: kswapd: stop high-order balancing when any suitable zone is balanced Simon Kirby reported the following problem We're seeing cases on a number of servers where cache never fully grows to use all available memory. Sometimes we see servers with 4 GB of memory that never seem to have less than 1.5 GB free, even with a constantly-active VM. In some cases, these servers also swap out while this happens, even though they are constantly reading the working set into memory. We have been seeing this happening for a long time; I don't think it's anything recent, and it still happens on 2.6.36. After some debugging work by Simon, Dave Hansen and others, the prevaling theory became that kswapd is reclaiming order-3 pages requested by SLUB too aggressive about it. There are two apparent problems here. On the target machine, there is a small Normal zone in comparison to DMA32. As kswapd tries to balance all zones, it would continually try reclaiming for Normal even though DMA32 was balanced enough for callers. The second problem is that sleeping_prematurely() does not use the same logic as balance_pgdat() when deciding whether to sleep or not. This keeps kswapd artifically awake. A number of tests were run and the figures from previous postings will look very different for a few reasons. One, the old figures were forcing my network card to use GFP_ATOMIC in attempt to replicate Simon's problem. Second, I previous specified slub_min_order=3 again in an attempt to reproduce Simon's problem. In this posting, I'm depending on Simon to say whether his problem is fixed or not and these figures are to show the impact to the ordinary cases. Finally, the "vmscan" figures are taken from /proc/vmstat instead of the tracepoints. There is less information but recording is less disruptive. The first test of relevance was postmark with a process running in the background reading a large amount of anonymous memory in blocks. The objective was to vaguely simulate what was happening on Simon's machine and it's memory intensive enough to have kswapd awake. POSTMARK traceonly kanyzone Transactions per second: 156.00 ( 0.00%) 153.00 (-1.96%) Data megabytes read per second: 21.51 ( 0.00%) 21.52 ( 0.05%) Data megabytes written per second: 29.28 ( 0.00%) 29.11 (-0.58%) Files created alone per second: 250.00 ( 0.00%) 416.00 (39.90%) Files create/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) Files deleted alone per second: 520.00 ( 0.00%) 420.00 (-23.81%) Files delete/transact per second: 79.00 ( 0.00%) 76.00 (-3.95%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 16.58 17.4 Total Elapsed Time (seconds) 218.48 222.47 VMstat Reclaim Statistics: vmscan Direct reclaims 0 4 Direct reclaim pages scanned 0 203 Direct reclaim pages reclaimed 0 184 Kswapd pages scanned 326631 322018 Kswapd pages reclaimed 312632 309784 Kswapd low wmark quickly 1 4 Kswapd high wmark quickly 122 475 Kswapd skip congestion_wait 1 0 Pages activated 700040 705317 Pages deactivated 212113 203922 Pages written 9875 6363 Total pages scanned 326631 322221 Total pages reclaimed 312632 309968 %age total pages scanned/reclaimed 95.71% 96.20% %age total pages scanned/written 3.02% 1.97% proc vmstat: Faults Major Faults 300 254 Minor Faults 645183 660284 Page ins 493588 486704 Page outs 4960088 4986704 Swap ins 1230 661 Swap outs 9869 6355 Performance is mildly affected because kswapd is no longer doing as much work and the background memory consumer process is getting in the way. Note that kswapd scanned and reclaimed fewer pages as it's less aggressive and overall fewer pages were scanned and reclaimed. Swap in/out is particularly reduced again reflecting kswapd throwing out fewer pages. The slight performance impact is unfortunate here but it looks like a direct result of kswapd being less aggressive. As the bug report is about too many pages being freed by kswapd, it may have to be accepted for now. The second test is a streaming IO benchmark that was previously used by Johannes to show regressions in page reclaim. MICRO traceonly kanyzone User/Sys Time Running Test (seconds) 29.29 28.87 Total Elapsed Time (seconds) 492.18 488.79 VMstat Reclaim Statistics: vmscan Direct reclaims 2128 1460 Direct reclaim pages scanned 2284822 1496067 Direct reclaim pages reclaimed 148919 110937 Kswapd pages scanned 15450014 16202876 Kswapd pages reclaimed 8503697 8537897 Kswapd low wmark quickly 3100 3397 Kswapd high wmark quickly 1860 7243 Kswapd skip congestion_wait 708 801 Pages activated 9635 9573 Pages deactivated 1432 1271 Pages written 223 1130 Total pages scanned 17734836 17698943 Total pages reclaimed 8652616 8648834 %age total pages scanned/reclaimed 48.79% 48.87% %age total pages scanned/written 0.00% 0.01% proc vmstat: Faults Major Faults 165 221 Minor Faults 9655785 9656506 Page ins 3880 7228 Page outs 37692940 37480076 Swap ins 0 69 Swap outs 19 15 Again fewer pages are scanned and reclaimed as expected and this time the test completed faster. Note that kswapd is hitting its watermarks faster (low and high wmark quickly) which I expect is due to kswapd reclaiming fewer pages. I also ran fs-mark, iozone and sysbench but there is nothing interesting to report in the figures. Performance is not significantly changed and the reclaim statistics look reasonable. Tgis patch: When the allocator enters its slow path, kswapd is woken up to balance the node. It continues working until all zones within the node are balanced. For order-0 allocations, this makes perfect sense but for higher orders it can have unintended side-effects. If the zone sizes are imbalanced, kswapd may reclaim heavily within a smaller zone discarding an excessive number of pages. The user-visible behaviour is that kswapd is awake and reclaiming even though plenty of pages are free from a suitable zone. This patch alters the "balance" logic for high-order reclaim allowing kswapd to stop if any suitable zone becomes balanced to reduce the number of pages it reclaims from other zones. kswapd still tries to ensure that order-0 watermarks for all zones are met before sleeping. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 8 +++--- mm/vmscan.c | 68 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 66 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 48906629335c..dad3612a7e39 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -639,6 +639,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + enum zone_type classzone_idx; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -654,7 +655,7 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0fd486467b4b..c31460918506 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1944,13 +1944,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int @@ -2028,7 +2029,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background diff --git a/mm/vmscan.c b/mm/vmscan.c index 7037cc8c60b6..3584067800e1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2246,11 +2246,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int classzone_idx) { int all_zones_ok; + int any_zone_ok; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2273,7 +2276,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2282,6 +2284,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + any_zone_ok = 0; /* * Scan in the highmem->dma direction for the highest @@ -2400,10 +2403,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= classzone_idx) + any_zone_ok = 1; } } - if (all_zones_ok) + if (all_zones_ok || (order && any_zone_ok)) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2426,7 +2431,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Any zone below pgdats classzone_idx must meet the high + * watermark for a balanced node + */ + if (!(all_zones_ok || (order && any_zone_ok))) { cond_resched(); try_to_freeze(); @@ -2451,6 +2462,36 @@ out: goto loop_again; } + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + return sc.nr_reclaimed; } @@ -2514,6 +2555,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2544,21 +2586,27 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; } ret = try_to_freeze(); @@ -2571,7 +2619,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + balance_pgdat(pgdat, order, classzone_idx); } } return 0; @@ -2580,7 +2628,7 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; @@ -2590,8 +2638,10 @@ void wakeup_kswapd(struct zone *zone, int order) if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) + if (pgdat->kswapd_max_order < order) { pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) -- cgit v1.2.3-71-gd317 From e9da73d67729b58bba256123e2b4651e0d8a01ac Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: compound_lock Add a new compound_lock() needed to serialize put_page against __split_huge_page_refcount(). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/page-flags.h | 12 +++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57e11b2a18ba..3b1754ad8785 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -14,6 +14,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -305,6 +306,39 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +static inline void compound_lock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_lock(PG_compound_lock, &page->flags); +#endif +} + +static inline void compound_unlock(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bit_spin_unlock(PG_compound_lock, &page->flags); +#endif +} + +static inline unsigned long compound_lock_irqsave(struct page *page) +{ + unsigned long uninitialized_var(flags); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + local_irq_save(flags); + compound_lock(page); +#endif + return flags; +} + +static inline void compound_unlock_irqrestore(struct page *page, + unsigned long flags) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + compound_unlock(page); + local_irq_restore(flags); +#endif +} + static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4805fdec8d6e..5a743cc87238 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -107,6 +107,9 @@ enum pageflags { #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + PG_compound_lock, #endif __NR_PAGEFLAGS, @@ -397,6 +400,12 @@ static inline void __ClearPageTail(struct page *page) #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) +#else +#define __PG_COMPOUND_LOCK 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -406,7 +415,8 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON) + 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ + __PG_COMPOUND_LOCK) /* * Flags checked when a page is prepped for return by the page allocator. -- cgit v1.2.3-71-gd317 From 9180706344487700b40da9eca5dedd3d11cb33b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: alter compound get_page/put_page Alter compound get_page/put_page to keep references on subpages too, in order to allow __split_huge_page_refcount to split an hugepage even while subpages have been pinned by one of the get_user_pages() variants. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 12 +++++++ arch/x86/mm/gup.c | 12 +++++++ include/linux/mm.h | 24 ++++++++++++-- mm/swap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 129 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index d7efdbf640c7..fec13200868f 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,6 +16,16 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } + if (PageTail(page)) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e6593799d..06f56fcf9a77 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -105,6 +105,16 @@ static inline void get_head_page_multiple(struct page *page, int nr) atomic_add(nr, &page->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -128,6 +138,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b1754ad8785..a2718e1ed585 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -353,9 +353,29 @@ static inline int page_count(struct page *page) static inline void get_page(struct page *page) { - page = compound_head(page); - VM_BUG_ON(atomic_read(&page->_count) == 0); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. Only if + * we're getting a tail page, the elevated page->_count is + * required only in the head page, so for tail pages the + * bugcheck only verifies that the page->_count isn't + * negative. + */ + VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); atomic_inc(&page->_count); + /* + * Getting a tail page will elevate both the head and tail + * page->_count(s). + */ + if (unlikely(PageTail(page))) { + /* + * This is safe only because + * __split_huge_page_refcount can't run under + * get_page(). + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + atomic_inc(&page->first_page->_count); + } } static inline struct page *virt_to_head_page(const void *x) diff --git a/mm/swap.c b/mm/swap.c index 3f4854205b16..33f5292fe132 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -56,17 +56,93 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; + + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} - dtor = get_compound_page_dtor(page); - (*dtor)(page); +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). + */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +151,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); -- cgit v1.2.3-71-gd317 From 14fd403f2146f740942d78af4e0ee59396ad8eab Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:37 -0800 Subject: thp: export maybe_mkwrite huge_memory.c needs it too when it fallbacks in copying hugepages into regular fragmented pages if hugepage allocation fails during COW. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++++ mm/memory.c | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2718e1ed585..6bef67d74adf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -429,6 +429,19 @@ static inline void set_compound_order(struct page *page, unsigned long order) page[1].lru.prev = (void *)order; } +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of diff --git a/mm/memory.c b/mm/memory.c index 1bbe9a22429c..bdf19366b705 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2083,19 +2083,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* -- cgit v1.2.3-71-gd317 From 8ac1f8320a0073f28cf9e0491af4cd98f504f92a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:43 -0800 Subject: thp: pte alloc trans splitting pte alloc routines must wait for split_huge_page if the pmd is not present and not null (i.e. pmd_trans_splitting). The additional branches are optimized away at compile time by pmd_trans_splitting if the config option is off. However we must pass the vma down in order to know the anon_vma lock to wait for. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/ia64/mm/hugetlbpage.c | 2 +- arch/sh/mm/hugetlbpage.c | 2 +- arch/sparc/mm/generic_32.c | 2 +- arch/sparc/mm/generic_64.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/um/kernel/skas/mmu.c | 2 +- arch/x86/kernel/tboot.c | 2 +- include/linux/mm.h | 15 +++++++++------ mm/memory.c | 19 +++++++++++++------ mm/mremap.c | 8 +++++--- 11 files changed, 35 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 93292a18cf77..709244c66fa3 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); + new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 1841ee7e65f9..5ca674b74737 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, NULL, pmd, taddr); } return pte; } diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 9163db3e8d15..d7762349ea48 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } } diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c index 5edcac184eaf..e6067b75f11c 100644 --- a/arch/sparc/mm/generic_32.c +++ b/arch/sparc/mm/generic_32.c @@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c index 04f2bf4cd571..3cb00dfd4bd6 100644 --- a/arch/sparc/mm/generic_64.c +++ b/arch/sparc/mm/generic_64.c @@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5fdddf134caa..f4e97646ce23 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } return pte; } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 3d099f974785..1aee587e9c5d 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pmd) goto out_pmd; - pte = pte_alloc_map(mm, pmd, proc); + pte = pte_alloc_map(mm, NULL, pmd, proc); if (!pte) goto out_pte; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26141e2..998e972f3b1a 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bef67d74adf..14ddd98b063f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,7 +1131,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1200,16 +1201,18 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map(mm, vma, pmd, address) \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ + pmd, address))? \ + NULL: pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ + pmd, address))? \ NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); diff --git a/mm/memory.c b/mm/memory.c index bdf19366b705..567bca80ea53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -3253,7 +3260,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); + pte = pte_alloc_map(mm, vma, pmd, address); if (!pte) return VM_FAULT_OOM; diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293a..b09eefaea0b8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -47,7 +47,8 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +63,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -147,7 +149,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; -- cgit v1.2.3-71-gd317 From 91a4ee2670e0ee2b0630a0eb24fb9a87ea3acd0a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: add pmd mmu_notifier helpers Add mmu notifier helpers to handle pmd huge operations. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 43dcfbdc39de..cbfab1e9957d 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -243,6 +243,32 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __pte; \ }) +#define pmdp_clear_flush_notify(__vma, __address, __pmdp) \ +({ \ + pmd_t __pmd; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + __pmd = pmdp_clear_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ + __pmd; \ +}) + +#define pmdp_splitting_flush_notify(__vma, __address, __pmdp) \ +({ \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + VM_BUG_ON(__address & ~HPAGE_PMD_MASK); \ + mmu_notifier_invalidate_range_start(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE);\ + pmdp_splitting_flush(___vma, ___address, __pmdp); \ + mmu_notifier_invalidate_range_end(___vma->vm_mm, ___address, \ + (__address)+HPAGE_PMD_SIZE); \ +}) + #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ @@ -254,6 +280,17 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) __young; \ }) +#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address); \ + __young; \ +}) + #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ @@ -305,7 +342,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) } #define ptep_clear_flush_young_notify ptep_clear_flush_young +#define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_flush_notify ptep_clear_flush +#define pmdp_clear_flush_notify pmdp_clear_flush +#define pmdp_splitting_flush_notify pmdp_splitting_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3-71-gd317 From 4e6af67e970a2ac287739a4c526c857b5bda27ec Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:44 -0800 Subject: thp: clear page compound split_huge_page must transform a compound page to a regular page and needs ClearPageCompound. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Reviewed-by: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5a743cc87238..4835cae71047 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -347,7 +347,7 @@ static inline void set_page_writeback(struct page *page) * tests can be used in performance sensitive paths. PageCompound is * generally not used in hot code paths. */ -__PAGEFLAG(Head, head) +__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) __PAGEFLAG(Tail, tail) static inline int PageCompound(struct page *page) @@ -355,6 +355,13 @@ static inline int PageCompound(struct page *page) return page->flags & ((1L << PG_head) | (1L << PG_tail)); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON(!PageHead(page)); + ClearPageHead(page); +} +#endif #else /* * Reduce page flag use as much as possible by overlapping @@ -392,6 +399,14 @@ static inline void __ClearPageTail(struct page *page) page->flags &= ~PG_head_tail_mask; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void ClearPageCompound(struct page *page) +{ + BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound)); + clear_bit(PG_compound, &page->flags); +} +#endif + #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_MMU -- cgit v1.2.3-71-gd317 From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:45 -0800 Subject: thp: add pmd_huge_pte to mm_struct This increase the size of the mm struct a bit but it is needed to preallocate one pte for each hugepage so that split_huge_page will not require a fail path. Guarantee of success is a fundamental property of split_huge_page to avoid decrasing swapping reliability and to avoid adding -ENOMEM fail paths that would otherwise force the hugepage-unaware VM code to learn rolling back in the middle of its pte mangling operations (if something we need it to learn handling pmd_trans_huge natively rather being capable of rollback). When split_huge_page runs a pte is needed to succeed the split, to map the newly splitted regular pages with a regular pte. This way all existing VM code remains backwards compatible by just adding a split_huge_page* one liner. The memory waste of those preallocated ptes is negligible and so it is worth it. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 3 +++ kernel/fork.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb7288a782fd..26bc4e2cd275 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -309,6 +309,9 @@ struct mm_struct { #endif #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif /* How many tasks sharing this mm are OOM_DISABLE */ atomic_t oom_disable_count; diff --git a/kernel/fork.c b/kernel/fork.c index 1499607e4da2..f78f50ba6cb2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -529,6 +529,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -669,6 +672,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.2.3-71-gd317 From 47ad8475c000141eacb3ecda5e5ce4b43a9cd04d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:47 -0800 Subject: thp: clear_copy_huge_page Move the copy/clear_huge_page functions to common code to share between hugetlb.c and huge_memory.c. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++ mm/hugetlb.c | 70 +++-------------------------------------------------- mm/memory.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 14ddd98b063f..cc6ab1038f6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1589,5 +1589,14 @@ static inline int is_hwpoison_address(unsigned long addr) extern void dump_page(struct page *page); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +extern void clear_huge_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page); +extern void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85855240933d..7bf223d6677b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_user_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_user_gigantic_page(dst, src, addr, vma); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } -} - static void copy_gigantic_page(struct page *dst, struct page *src) { int i; @@ -2454,7 +2389,8 @@ retry_avoidcopy: return VM_FAULT_OOM; } - copy_user_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2558,7 +2494,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { diff --git a/mm/memory.c b/mm/memory.c index 567bca80ea53..60e1c68d8218 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3645,3 +3645,74 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -- cgit v1.2.3-71-gd317 From 936a5fe6e6148c0b3ea0d792b903847d9b9931a1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:48 -0800 Subject: thp: kvm mmu transparent hugepage support This should work for both hugetlbfs and transparent hugepages. [akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability] Signed-off-by: Andrea Arcangeli Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 91 ++++++++++++++++++++++++++++++++++++++-------- arch/x86/kvm/paging_tmpl.h | 9 ++++- include/linux/page-flags.h | 12 ++++++ virt/kvm/kvm_main.c | 32 +++++++++++++++- 4 files changed, 125 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cafbb499813..47b2c3288b6b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level, level, max_level; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return PT_PAGE_TABLE_LEVEL; + return true; + return false; +} + +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + int host_level, level, max_level; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) return 1; } +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t *gfnp, pfn_t *pfnp, int *levelp) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *gfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + level == PT_PAGE_TABLE_LEVEL && + PageTransCompound(pfn_to_page(pfn)) && + !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + gfn &= ~mask; + *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + if (!get_page_unless_zero(pfn_to_page(pfn))) + BUG(); + *pfnp = pfn; + } + } +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); @@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, { int r; int level; + int force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable; - level = mapping_level(vcpu, gfn); - - /* - * This path builds a PAE pagetable - so we can map 2mb pages at - * maximum. Therefore check if the level is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; + int force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - level = mapping_level(vcpu, gfn); - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (likely(!force_pt_level)) { + level = mapping_level(vcpu, gfn); + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } else + level = PT_PAGE_TABLE_LEVEL; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 53210f1e94c2..6bccc24c4181 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; + int force_pt_level; unsigned long mmu_seq; bool map_writable; @@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } - if (walker.level >= PT_DIRECTORY_LEVEL) { + if (walker.level >= PT_DIRECTORY_LEVEL) + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + else + force_pt_level = 1; + if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } @@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); + if (!force_pt_level) + transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn, map_writable, prefault); (void)sptep; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4835cae71047..907f1605926b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int PageTransCompound(struct page *page) +{ + return PageCompound(page); +} +#else +static inline int PageTransCompound(struct page *page) +{ + return 0; +} +#endif + #ifdef CONFIG_MMU #define __PG_MLOCKED (1 << PG_mlocked) #else diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f686251f711..85ab7db0d366 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,8 +104,36 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *page = compound_head(pfn_to_page(pfn)); - return PageReserved(page); + struct page *head; + struct page *tail = pfn_to_page(pfn); + head = compound_head(tail); + if (head != tail) { + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail + * before overwriting first_page, so if + * PageTail is still there it means the head + * pointer isn't dangling. + */ + if (PageTail(tail)) { + /* + * the "head" is not a dangling + * pointer but the hugepage may have + * been splitted from under us (and we + * may not hold a reference count on + * the head page so it can be reused + * before we run PageReferenced), so + * we've to recheck PageTail before + * returning what we just read. + */ + int reserved = PageReserved(head); + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + } + return PageReserved(tail); } return true; -- cgit v1.2.3-71-gd317 From 32dba98e085f8b2b4345887df9abf5e0e93bfc12 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:49 -0800 Subject: thp: _GFP_NO_KSWAPD Transparent hugepage allocations must be allowed not to invoke kswapd or any other kind of indirect reclaim (especially when the defrag sysfs is control disabled). It's unacceptable to swap out anonymous pages (potentially anonymous transparent hugepages) in order to create new transparent hugepages. This is true for the MADV_HUGEPAGE areas too (swapping out a kvm virtual machine and so having it suffer an unbearable slowdown, so another one with guest physical memory marked MADV_HUGEPAGE can run 30% faster if it is running memory intensive workloads, makes no sense). If a transparent hugepage allocation fails the slowdown is minor and there is total fallback, so kswapd should never be asked to swapout memory to allow the high order allocation to succeed. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ++++- mm/page_alloc.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f54adfcbec9c..49d2356bb82d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif +#define ___GFP_NO_KSWAPD 0x400000u /* * GFP bitmasks.. @@ -81,13 +82,15 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e62d5f9d40b..bbd0423f2820 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2027,7 +2027,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(preferred_zone)); /* -- cgit v1.2.3-71-gd317 From 71e3aac0724ffe8918992d76acfe3aad7d8724a5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:52 -0800 Subject: thp: transparent hugepage core Lately I've been working to make KVM use hugepages transparently without the usual restrictions of hugetlbfs. Some of the restrictions I'd like to see removed: 1) hugepages have to be swappable or the guest physical memory remains locked in RAM and can't be paged out to swap 2) if a hugepage allocation fails, regular pages should be allocated instead and mixed in the same vma without any failure and without userland noticing 3) if some task quits and more hugepages become available in the buddy, guest physical memory backed by regular pages should be relocated on hugepages automatically in regions under madvise(MADV_HUGEPAGE) (ideally event driven by waking up the kernel deamon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes not null) 4) avoidance of reservation and maximization of use of hugepages whenever possible. Reservation (needed to avoid runtime fatal faliures) may be ok for 1 machine with 1 database with 1 database cache with 1 database cache size known at boot time. It's definitely not feasible with a virtualization hypervisor usage like RHEV-H that runs an unknown number of virtual machines with an unknown size of each virtual machine with an unknown amount of pagecache that could be potentially useful in the host for guest not using O_DIRECT (aka cache=off). hugepages in the virtualization hypervisor (and also in the guest!) are much more important than in a regular host not using virtualization, becasue with NPT/EPT they decrease the tlb-miss cacheline accesses from 24 to 19 in case only the hypervisor uses transparent hugepages, and they decrease the tlb-miss cacheline accesses from 19 to 15 in case both the linux hypervisor and the linux guest both uses this patch (though the guest will limit the addition speedup to anonymous regions only for now...). Even more important is that the tlb miss handler is much slower on a NPT/EPT guest than for a regular shadow paging or no-virtualization scenario. So maximizing the amount of virtual memory cached by the TLB pays off significantly more with NPT/EPT than without (even if there would be no significant speedup in the tlb-miss runtime). The first (and more tedious) part of this work requires allowing the VM to handle anonymous hugepages mixed with regular pages transparently on regular anonymous vmas. This is what this patch tries to achieve in the least intrusive possible way. We want hugepages and hugetlb to be used in a way so that all applications can benefit without changes (as usual we leverage the KVM virtualization design: by improving the Linux VM at large, KVM gets the performance boost too). The most important design choice is: always fallback to 4k allocation if the hugepage allocation fails! This is the _very_ opposite of some large pagecache patches that failed with -EIO back then if a 64k (or similar) allocation failed... Second important decision (to reduce the impact of the feature on the existing pagetable handling code) is that at any time we can split an hugepage into 512 regular pages and it has to be done with an operation that can't fail. This way the reliability of the swapping isn't decreased (no need to allocate memory when we are short on memory to swap) and it's trivial to plug a split_huge_page* one-liner where needed without polluting the VM. Over time we can teach mprotect, mremap and friends to handle pmd_trans_huge natively without calling split_huge_page*. The fact it can't fail isn't just for swap: if split_huge_page would return -ENOMEM (instead of the current void) we'd need to rollback the mprotect from the middle of it (ideally including undoing the split_vma) which would be a big change and in the very wrong direction (it'd likely be simpler not to call split_huge_page at all and to teach mprotect and friends to handle hugepages instead of rolling them back from the middle). In short the very value of split_huge_page is that it can't fail. The collapsing and madvise(MADV_HUGEPAGE) part will remain separated and incremental and it'll just be an "harmless" addition later if this initial part is agreed upon. It also should be noted that locking-wise replacing regular pages with hugepages is going to be very easy if compared to what I'm doing below in split_huge_page, as it will only happen when page_count(page) matches page_mapcount(page) if we can take the PG_lock and mmap_sem in write mode. collapse_huge_page will be a "best effort" that (unlike split_huge_page) can fail at the minimal sign of trouble and we can try again later. collapse_huge_page will be similar to how KSM works and the madvise(MADV_HUGEPAGE) will work similar to madvise(MADV_MERGEABLE). The default I like is that transparent hugepages are used at page fault time. This can be changed with /sys/kernel/mm/transparent_hugepage/enabled. The control knob can be set to three values "always", "madvise", "never" which mean respectively that hugepages are always used, or only inside madvise(MADV_HUGEPAGE) regions, or never used. /sys/kernel/mm/transparent_hugepage/defrag instead controls if the hugepage allocation should defrag memory aggressively "always", only inside "madvise" regions, or "never". The pmd_trans_splitting/pmd_trans_huge locking is very solid. The put_page (from get_user_page users that can't use mmu notifier like O_DIRECT) that runs against a __split_huge_page_refcount instead was a pain to serialize in a way that would result always in a coherent page count for both tail and head. I think my locking solution with a compound_lock taken only after the page_first is valid and is still a PageHead should be safe but it surely needs review from SMP race point of view. In short there is no current existing way to serialize the O_DIRECT final put_page against split_huge_page_refcount so I had to invent a new one (O_DIRECT loses knowledge on the mapping status by the time gup_fast returns so...). And I didn't want to impact all gup/gup_fast users for now, maybe if we change the gup interface substantially we can avoid this locking, I admit I didn't think too much about it because changing the gup unpinning interface would be invasive. If we ignored O_DIRECT we could stick to the existing compound refcounting code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM (and any other mmu notifier user) would call it without FOLL_GET (and if FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the current task mmu notifier list yet). But O_DIRECT is fundamental for decent performance of virtualized I/O on fast storage so we can't avoid it to solve the race of put_page against split_huge_page_refcount to achieve a complete hugepage feature for KVM. Swap and oom works fine (well just like with regular pages ;). MMU notifier is handled transparently too, with the exception of the young bit on the pmd, that didn't have a range check but I think KVM will be fine because the whole point of hugepages is that EPT/NPT will also use a huge pmd when they notice gup returns pages with PageCompound set, so they won't care of a range and there's just the pmd young bit to check in that case. NOTE: in some cases if the L2 cache is small, this may slowdown and waste memory during COWs because 4M of memory are accessed in a single fault instead of 8k (the payoff is that after COW the program can run faster). So we might want to switch the copy_huge_page (and clear_huge_page too) to not temporal stores. I also extensively researched ways to avoid this cache trashing with a full prefault logic that would cow in 8k/16k/32k/64k up to 1M (I can send those patches that fully implemented prefault) but I concluded they're not worth it and they add an huge additional complexity and they remove all tlb benefits until the full hugepage has been faulted in, to save a little bit of memory and some cache during app startup, but they still don't improve substantially the cache-trashing during startup if the prefault happens in >4k chunks. One reason is that those 4k pte entries copied are still mapped on a perfectly cache-colored hugepage, so the trashing is the worst one can generate in those copies (cow of 4k page copies aren't so well colored so they trashes less, but again this results in software running faster after the page fault). Those prefault patches allowed things like a pte where post-cow pages were local 4k regular anon pages and the not-yet-cowed pte entries were pointing in the middle of some hugepage mapped read-only. If it doesn't payoff substantially with todays hardware it will payoff even less in the future with larger l2 caches, and the prefault logic would blot the VM a lot. If one is emebdded transparent_hugepage can be disabled during boot with sysfs or with the boot commandline parameter transparent_hugepage=0 (or transparent_hugepage=2 to restrict hugepages inside madvise regions) that will ensure not a single hugepage is allocated at boot time. It is simple enough to just disable transparent hugepage globally and let transparent hugepages be allocated selectively by applications in the MADV_HUGEPAGE region (both at page fault time, and if enabled with the collapse_huge_page too through the kernel daemon). This patch supports only hugepages mapped in the pmd, archs that have smaller hugepages will not fit in this patch alone. Also some archs like power have certain tlb limits that prevents mixing different page size in the same regions so they will not fit in this framework that requires "graceful fallback" to basic PAGE_SIZE in case of physical memory fragmentation. hugetlbfs remains a perfect fit for those because its software limits happen to match the hardware limits. hugetlbfs also remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped to be found not fragmented after a certain system uptime and that would be very expensive to defragment with relocation, so requiring reservation. hugetlbfs is the "reservation way", the point of transparent hugepages is not to have any reservation at all and maximizing the use of cache and hugepages at all times automatically. Some performance result: vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largep ages3 memset page fault 1566023 memset tlb miss 453854 memset second tlb miss 453321 random access tlb miss 41635 random access second tlb miss 41658 vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3 memset page fault 1566471 memset tlb miss 453375 memset second tlb miss 453320 random access tlb miss 41636 random access second tlb miss 41637 vmx andrea # ./largepages3 memset page fault 1566642 memset tlb miss 453417 memset second tlb miss 453313 random access tlb miss 41630 random access second tlb miss 41647 vmx andrea # ./largepages3 memset page fault 1566872 memset tlb miss 453418 memset second tlb miss 453315 random access tlb miss 41618 random access second tlb miss 41659 vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage vmx andrea # ./largepages3 memset page fault 2182476 memset tlb miss 460305 memset second tlb miss 460179 random access tlb miss 44483 random access second tlb miss 44186 vmx andrea # ./largepages3 memset page fault 2182791 memset tlb miss 460742 memset second tlb miss 459962 random access tlb miss 43981 random access second tlb miss 43988 ============ #include #include #include #include #define SIZE (3UL*1024*1024*1024) int main() { char *p = malloc(SIZE), *p2; struct timeval before, after; gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset page fault %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); memset(p, 0, SIZE); gettimeofday(&after, NULL); printf("memset second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); gettimeofday(&before, NULL); for (p2 = p; p2 < p+SIZE; p2 += 4096) *p2 = 0; gettimeofday(&after, NULL); printf("random access second tlb miss %Lu\n", (after.tv_sec-before.tv_sec)*1000000UL + after.tv_usec-before.tv_usec); return 0; } ============ Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 5 + include/linux/gfp.h | 3 + include/linux/huge_mm.h | 118 +++++ include/linux/mm.h | 4 + include/linux/mm_inline.h | 11 +- include/linux/page-flags.h | 21 + include/linux/rmap.h | 2 + include/linux/swap.h | 2 + mm/Makefile | 1 + mm/huge_memory.c | 901 ++++++++++++++++++++++++++++++++++++++ mm/internal.h | 4 + mm/memory.c | 84 +++- mm/rmap.c | 62 ++- mm/swap.c | 37 ++ 14 files changed, 1220 insertions(+), 35 deletions(-) create mode 100644 include/linux/huge_mm.h create mode 100644 mm/huge_memory.c (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1fb61a74b2e1..b2df039a4119 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -286,6 +286,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 49d2356bb82d..d95082cc6f4a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -109,6 +109,9 @@ struct vm_area_struct; __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) +#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h new file mode 100644 index 000000000000..9301824c7491 --- /dev/null +++ b/include/linux/huge_mm.h @@ -0,0 +1,118 @@ +#ifndef _LINUX_HUGE_MM_H +#define _LINUX_HUGE_MM_H + +extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags); +extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pmd_t orig_pmd); +extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); +extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags); +extern int zap_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd); + +enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +#ifdef CONFIG_DEBUG_VM + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, +#endif +}; + +enum page_check_address_pmd_flag { + PAGE_CHECK_ADDRESS_PMD_FLAG, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, +}; +extern pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define HPAGE_PMD_SHIFT HPAGE_SHIFT +#define HPAGE_PMD_MASK HPAGE_MASK +#define HPAGE_PMD_SIZE HPAGE_SIZE + +#define transparent_hugepage_enabled(__vma) \ + (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) +#define transparent_hugepage_defrag(__vma) \ + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE)) +#ifdef CONFIG_DEBUG_VM +#define transparent_hugepage_debug_cow() \ + (transparent_hugepage_flags & \ + (1<root->lock); \ + /* \ + * spin_unlock_wait() is just a loop in C and so the \ + * CPU can reorder anything around it. \ + */ \ + smp_mb(); \ + BUG_ON(pmd_trans_splitting(*____pmd) || \ + pmd_trans_huge(*____pmd)); \ + } while (0) +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1< MAX_ORDER +#error "hugepages can't be allocated by the buddy allocator" +#endif +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUG(); 0; }) + +#define transparent_hugepage_enabled(__vma) 0 + +#define transparent_hugepage_flags 0UL +static inline int split_huge_page(struct page *page) +{ + return 0; +} +#define split_huge_page_pmd(__mm, __pmd) \ + do { } while (0) +#define wait_split_huge_page(__anon_vma, __pmd) \ + do { } while (0) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index cc6ab1038f6f..78adec4ba9f4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -111,6 +111,9 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#if BITS_PER_LONG > 32 +#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ +#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -243,6 +246,7 @@ struct inode; * files which need it (119 of them) */ #include +#include /* * Methods to modify the page usage count. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8835b877b8db..650f31eabdb1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -20,13 +20,20 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, + struct list_head *head) { - list_add(&page->lru, &zone->lru[l].list); + list_add(&page->lru, head); __inc_zone_state(zone, NR_LRU_BASE + l); mem_cgroup_add_lru_list(page, l); } +static inline void +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); +} + static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 907f1605926b..4ca1241ef94e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -410,11 +410,32 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * PageHuge() only returns true for hugetlbfs pages, but not for + * normal or transparent huge pages. + * + * PageTransHuge() returns true for both transparent huge and + * hugetlbfs pages, but not normal pages. PageTransHuge() can only be + * called only in the core VM paths where hugetlbfs pages can't exist. + */ +static inline int PageTransHuge(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + return PageHead(page); +} + static inline int PageTransCompound(struct page *page) { return PageCompound(page); } + #else + +static inline int PageTransHuge(struct page *page) +{ + return 0; +} + static inline int PageTransCompound(struct page *page) { return 0; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb83c0da2071..e9fd04ca1e51 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ enum ttu_flags { }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) +bool is_vma_temporary_stack(struct vm_area_struct *vma); + int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); diff --git a/include/linux/swap.h b/include/linux/swap.h index eba53e71d2cc..4d559325d919 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); diff --git a/mm/Makefile b/mm/Makefile index 380772a9ccb8..2b1b575ae712 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 000000000000..0c1e8f939f7c --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,901 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +unsigned long transparent_hugepage_flags __read_mostly = + (1<page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (unlikely(!pages[i])) { + while (--i >= 0) + put_page(pages[i]); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SHIFT*i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_PMD_NR; i++) + put_page(pages[i]); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + else + new_page = NULL; + + if (unlikely(!new_page)) { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + __SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + put_page(new_page); + else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)); + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. + */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + lru_add_page_tail(zone, page, page_tail); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. + */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. + */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} diff --git a/mm/internal.h b/mm/internal.h index bd4f581f624a..69488205723d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { diff --git a/mm/memory.c b/mm/memory.c index 60e1c68d8218..c50a195041ec 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -726,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -802,6 +802,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -1004,6 +1014,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1280,11 +1299,27 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -3179,9 +3214,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3260,9 +3295,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, vma, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } diff --git a/mm/rmap.c b/mm/rmap.c index a3197a8a295b..e41375a6b029 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -360,7 +360,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. */ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -435,6 +435,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -489,35 +491,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - /* * Don't want to elevate referenced for mlocked page that gets this far, * in order that it progresses to try_to_unmap and is moved to the * unevictable list. */ if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ + *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; - goto out_unmap; - } - - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!VM_SequentialReadHint(vma))) - referenced++; + goto out; } /* Pretend the page is referenced if the task has the @@ -526,9 +510,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (pmd && !pmd_trans_splitting(*pmd) && + pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. + */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + } + (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -1202,7 +1216,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); diff --git a/mm/swap.c b/mm/swap.c index e0eeef940886..c02f93611a84 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -479,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. -- cgit v1.2.3-71-gd317 From 0af4e98b6b095c74588af04872f83d333c958c32 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: madvise(MADV_HUGEPAGE) Add madvise MADV_HUGEPAGE to mark regions that are important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++++ mm/huge_memory.c | 16 ++++++++++++++++ mm/madvise.c | 8 ++++++++ 3 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9301824c7491..d9ab70d776e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -97,6 +97,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif +extern int hugepage_madvise(unsigned long *vm_flags); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -113,6 +114,11 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +static inline int hugepage_madvise(unsigned long *vm_flags) +{ + BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 763507932898..620891f4e54f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -896,6 +896,22 @@ out: return ret; } +int hugepage_madvise(unsigned long *vm_flags) +{ + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + + *vm_flags |= VM_HUGEPAGE; + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db74..ecde40a401c1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,11 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + error = hugepage_madvise(&new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -282,6 +287,9 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: #endif return 1; -- cgit v1.2.3-71-gd317 From 500d65d471018d9a13b0d51b7e141ed2a3555c1d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: pmd_trans_huge migrate bugcheck No pmd_trans_huge should ever materialize in migration ptes areas, because we split the hugepage before migration ptes are instantiated. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 5 +++++ mm/migrate.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 78adec4ba9f4..2ec5138badab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index c1a80e00458d..12ee1ea237f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1305,6 +1305,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto out; } if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } spin_lock(&mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { @@ -1320,6 +1324,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); /* fall through */ } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; diff --git a/mm/migrate.c b/mm/migrate.c index 690d0de993af..1a531b760b3b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; if (!pmd_present(*pmd)) goto out; @@ -632,6 +634,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; @@ -1063,7 +1068,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) -- cgit v1.2.3-71-gd317 From 79134171df238171daa4c024a42b77b401ccb00b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: transparent hugepage vmstat Add hugepage stat information to /proc/vmstat and /proc/meminfo. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 14 +++++++++++++- include/linux/mmzone.h | 1 + mm/huge_memory.c | 3 +++ mm/rmap.c | 20 ++++++++++++++++---- mm/vmstat.c | 1 + 5 files changed, 34 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239cfd97e..ed257d141568 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -100,6 +100,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "VmallocChunk: %8lu kB\n" #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" #endif , K(i.totalram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -150,6 +158,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) #endif ); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dad3612a7e39..02ecb0189b1d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,7 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + NR_ANON_TRANSPARENT_HUGEPAGES, NR_VM_ZONE_STAT_ITEMS }; /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a313403b3c5e..7101112a5429 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -751,6 +751,9 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/rmap.c b/mm/rmap.c index 92e14dcfe737..3825ae4bc32f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -882,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -911,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -964,7 +972,11 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_update_file_mapped(page, -1); diff --git a/mm/vmstat.c b/mm/vmstat.c index 751a65e00aac..0c3b5048773e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -880,6 +880,7 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", "nr_dirty_threshold", "nr_dirty_background_threshold", -- cgit v1.2.3-71-gd317 From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (this is indipendent of the defrag logic that will have to make new hugepages available) The fundamental reason why khugepaged is unavoidable, is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge-pmd in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow, it takes nearly zero cpu time, except when it copies data (in which case it means we definitely want to pay for that cpu time) so it seems a good tradeoff. In addition to the hugepages being released by other process releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselfs to be long lived mappings (khugepaged scan is slow so short lived mappings have low probability to run into khugepaged if compared to long lived mappings). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 66 +++ include/linux/sched.h | 1 + kernel/fork.c | 5 + mm/huge_memory.c | 1073 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 1136 insertions(+), 10 deletions(-) create mode 100644 include/linux/khugepaged.h (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d9ab70d776e2..43a694ef8904 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,6 +25,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 000000000000..552f3184756c --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,66 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if (khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f23b5bb6f52e..d747f948b34e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/fork.c b/kernel/fork.c index f78f50ba6cb2..25e429152ddc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -328,6 +329,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); if (retval) goto out; @@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7101112a5429..ae2bf08b1099 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,14 +12,111 @@ #include #include #include +#include +#include +#include #include #include #include "internal.h" +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ unsigned long transparent_hugepage_flags __read_mostly = - (1< 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + return ret; } static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); @@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = { static struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, - .name = "transparent_hugepage", +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static int __init hugepage_init(void) { -#ifdef CONFIG_SYSFS int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; - err = sysfs_create_group(mm_kobj, &hugepage_attr_group); - if (err) - printk(KERN_ERR "hugepage: register sysfs failed\n"); + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } #endif - return 0; + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + start_khugepaged(); + +out: + return err; } module_init(hugepage_init) @@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; page = alloc_hugepage(transparent_hugepage_defrag(vma)); if (unlikely(!page)) goto out; @@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags) return 0; } +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. + */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(!*hpage); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + pte_unmap(pte); + + if (unlikely(!isolated)) { + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + mem_cgroup_uncharge_page(new_page); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + + *hpage = NULL; + khugepaged_pages_collapsed++; +out: + up_write(&mm->mmap_sem); +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + up_read(&mm->mmap_sem); + collapse_huge_page(mm, address, hpage); + } +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if (!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) { + progress++; + continue; + } + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { + khugepaged_scan.address = vma->vm_end; + progress++; + continue; + } + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) { + progress++; + continue; + } + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + if (khugepaged_scan.address > hend) { + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; + progress++; + continue; + } + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. + */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} + +static void khugepaged_loop(void) +{ + struct page *hpage; + + while (likely(khugepaged_enabled())) { + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; + + khugepaged_do_scan(&hpage); + if (hpage) + put_page(hpage); + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_interruptible(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; -- cgit v1.2.3-71-gd317 From 5f24ce5fd34c3ca1b3d10d30da754732da64d5c0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:00 -0800 Subject: thp: remove PG_buddy PG_buddy can be converted to _mapcount == -2. So the PG_compound_lock can be added to page->flags without overflowing (because of the sparse section bits increasing) with CONFIG_X86_PAE=y and CONFIG_X86_PAT=y. This also has to move the memory hotplug code from _mapcount to lru.next to avoid any risk of clashes. We can't use lru.next for PG_buddy removal, but memory hotplug can use lru.next even more easily than the mapcount instead. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 14 ++++++++------ include/linux/memory_hotplug.h | 14 +++++++++----- include/linux/mm.h | 21 +++++++++++++++++++++ include/linux/page-flags.h | 7 +------ mm/memory_hotplug.c | 14 ++++++++------ mm/page_alloc.c | 7 +++---- mm/sparse.c | 4 ++-- 7 files changed, 52 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/page.c b/fs/proc/page.c index b06c674624e6..6d8e6a9e93ab 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a00c48..24376fe7ee68 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,12 +13,16 @@ struct mem_section; #ifdef CONFIG_MEMORY_HOTPLUG /* - * Types for free bootmem. - * The normal smallest mapcount is -1. Here is smaller value than it. + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. */ -#define SECTION_INFO (-1 - 1) -#define MIX_SECTION_INFO (-1 - 2) -#define NODE_INFO (-1 - 3) +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; /* * pgdat resizing functions diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ec5138badab..7ab7d2b60041 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,6 +397,27 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == -2; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON(atomic_read(&page->_mapcount) != -1); + atomic_set(&page->_mapcount, -2); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + atomic_set(&page->_mapcount, -1); +} + void put_page(struct page *page); void put_pages_list(struct list_head *pages); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4ca1241ef94e..0db8037e2725 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -48,9 +48,6 @@ * struct page (these bits with information) are always mapped into kernel * address space... * - * PG_buddy is set to indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! @@ -96,7 +93,6 @@ enum pageflags { PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ - PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU @@ -233,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -__PAGEFLAG(Buddy, buddy) PAGEFLAG(MappedToDisk, mappedtodisk) /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ @@ -461,7 +456,7 @@ static inline int PageTransCompound(struct page *page) #define PAGE_FLAGS_CHECK_AT_FREE \ (1 << PG_lru | 1 << PG_locked | \ 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ __PG_COMPOUND_LOCK) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2832c092509..e92f04749fcb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e7664b9f706c..9dfe49bceff4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -449,8 +449,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -483,7 +483,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -5574,7 +5574,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af379..93250207c5cf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); -- cgit v1.2.3-71-gd317 From f2d6bfe9ff0acec30b713614260e78b03d20e909 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:01 -0800 Subject: thp: add x86 32bit support Add support for transparent hugepages to x86 32bit. Share the same VM_ bitflag for VM_MAPPED_COPY. mm/nommu.c will never support transparent hugepages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 9 +++ arch/x86/include/asm/pgtable-3level.h | 23 +++++++ arch/x86/include/asm/pgtable.h | 117 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 109 ------------------------------- arch/x86/mm/pgtable.c | 4 +- include/linux/mm.h | 7 +- mm/Kconfig | 2 +- 7 files changed, 156 insertions(+), 115 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 2334982b339e..98391db840c6 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) +{ + return __pmd(xchg((pmdval_t *)xp, 0)); +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range: diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b0165ea01..94b979d1b58d 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3278038e9706..001a3831567a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -97,6 +97,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -145,6 +150,18 @@ static inline int pmd_large(pmd_t pte) (_PAGE_PSE | _PAGE_PRESENT); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); @@ -219,6 +236,55 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -527,6 +593,14 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) return res; } +static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + native_pmd_clear(pmdp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -616,6 +690,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index b2df039a4119..975f709e09ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -182,115 +182,6 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_splitting(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_SPLITTING; -} - -static inline int pmd_trans_huge(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_PSE; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) - -#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS -extern int pmdp_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp, - pmd_t entry, int dirty); - -#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); - - -#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMD_WRITE -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = native_pmdp_get_and_clear(pmdp); - pmd_update(mm, addr, pmdp); - return pmd; -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - clear_bit(_PAGE_BIT_RW, (unsigned long *)&pmdp->pmd); - pmd_update(mm, addr, pmdp); -} - -static inline int pmd_young(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_ACCESSED; -} - -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - -static inline pmd_t pmd_mkold(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mkdirty(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_DIRTY); -} - -static inline pmd_t pmd_mkhuge(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_PSE); -} - -static inline pmd_t pmd_mkyoung(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_mkwrite(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT); -} - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 65e92d58f942..500242d3c96d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -362,7 +362,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, - (unsigned long *) &pmdp->pmd); + (unsigned long *)pmdp); if (ret) pmd_update(vma->vm_mm, addr, pmdp); @@ -404,7 +404,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, int set; VM_BUG_ON(address & ~HPAGE_PMD_MASK); set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)&pmdp->pmd); + (unsigned long *)pmdp); if (set) { pmd_update(vma->vm_mm, address, pmdp); /* need tlb flush only to serialize against gup-fast */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ab7d2b60041..9c2695beab86 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#else +#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -111,9 +115,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ -#if BITS_PER_LONG > 32 -#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ -#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/mm/Kconfig b/mm/Kconfig index 3982be2d721a..d774f77538ce 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -304,7 +304,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" if EMBEDDED - depends on X86_64 && MMU + depends on X86 && MMU default y help Transparent Hugepages allows the kernel to use huge pages and -- cgit v1.2.3-71-gd317 From 0ca1634d4143c3579273ca53b993df19f5c98e92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:02 -0800 Subject: thp: mincore transparent hugepage support Handle transparent huge page pmd entries natively instead of splitting them into subpages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 25 +++++++++++++++++++++++++ mm/mincore.c | 8 +++++++- 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 43a694ef8904..25125fb6acf7 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd); +extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae2bf08b1099..37e89a32a0b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -923,6 +923,31 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mincore.c b/mm/mincore.c index 9959bb41570e..a4e6b9d75c76 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,7 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else -- cgit v1.2.3-71-gd317 From cd7548ab360c462118568eebb8c6da3bc303b02e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:04 -0800 Subject: thp: mprotect: transparent huge page support Natively handle huge pmds when changing page tables on behalf of mprotect(). I left out update_mmu_cache() because we do not need it on x86 anyway but more importantly the interface works on ptes, not pmds. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 27 +++++++++++++++++++++++++++ mm/mprotect.c | 8 +++++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 25125fb6acf7..c590b08c6fa6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -22,6 +22,8 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); +extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 37e89a32a0b1..7b55fe0e998b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -948,6 +948,33 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, return ret; } +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mprotect.c b/mm/mprotect.c index 9402080d0d4a..5a688a2756be 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -88,7 +88,13 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; change_pte_range(vma->vm_mm, pmd, addr, next, newprot, -- cgit v1.2.3-71-gd317 From 0bbbc0b33d141f78a0d9218a54a47f50621220d3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: add numa awareness to hugepage allocations It's mostly a matter of replacing alloc_pages with alloc_pages_vma after introducing alloc_pages_vma. khugepaged needs special handling as the allocation has to happen inside collapse_huge_page where the vma is known and an error has to be returned to the outer loop to sleep alloc_sleep_millisecs in case of failure. But it retains the more efficient logic of handling allocation failures in khugepaged in case of CONFIG_NUMA=n. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 7 +++-- mm/huge_memory.c | 87 +++++++++++++++++++++++++++++++++++++++++++++-------- mm/mempolicy.c | 13 +++++--- 3 files changed, 87 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d95082cc6f4a..a3b148a91874 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(gfp_t gfp_mask, +extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_pages_vma(gfp_mask, order, vma, addr) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0415a83afd66..f6559e7711bd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -620,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return ret; } +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT), + return alloc_pages(alloc_hugepage_gfpmask(defrag), HPAGE_PMD_ORDER); } +#endif int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -639,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; - page = alloc_hugepage(transparent_hugepage_defrag(vma)); + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -862,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); else new_page = NULL; @@ -1661,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long hstart, hend; VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); +#else + VM_BUG_ON(*hpage); +#endif /* * Prevent all access to pagetables with the exception of @@ -1699,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; +#ifndef CONFIG_NUMA new_page = *hpage; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) +#else + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + *hpage = ERR_PTR(-ENOMEM); goto out; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out_put_page; anon_vma_lock(vma->anon_vma); @@ -1730,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out; + goto out_put_page; } /* @@ -1765,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm, mm->nr_ptes--; spin_unlock(&mm->page_table_lock); +#ifndef CONFIG_NUMA *hpage = NULL; +#endif khugepaged_pages_collapsed++; out: up_write(&mm->mmap_sem); + return; + +out_put_page: +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -2001,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage) while (progress < pages) { cond_resched(); +#ifndef CONFIG_NUMA if (!*hpage) { *hpage = alloc_hugepage(khugepaged_defrag()); if (unlikely(!*hpage)) break; } +#else + if (IS_ERR(*hpage)) + break; +#endif spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) @@ -2020,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage) } } +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA static struct page *khugepaged_alloc_hugepage(void) { struct page *hpage; do { hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); - } + if (!hpage) + khugepaged_alloc_sleep(); } while (unlikely(!hpage) && likely(khugepaged_enabled())); return hpage; } +#endif static void khugepaged_loop(void) { struct page *hpage; +#ifdef CONFIG_NUMA + hpage = NULL; +#endif while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); if (unlikely(!hpage)) break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA if (hpage) put_page(hpage); +#endif if (khugepaged_has_work()) { DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 83b7df309fc4..368fc9d23610 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3-71-gd317 From 94fcc585fb85ad7b059c70872489b50044d401f3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:08 -0800 Subject: thp: avoid breaking huge pmd invariants in case of vma_adjust failures An huge pmd can only be mapped if the corresponding 2M virtual range is fully contained in the vma. At times the VM calls split_vma twice, if the first split_vma succeeds and the second fail, the first split_vma remains in effect and it's not rolled back. For split_vma or vma_adjust to fail an allocation failure is needed so it's a very unlikely event (the out of memory killer would normally fire before any allocation failure is visible to kernel and userland and if an out of memory condition happens it's unlikely to happen exactly here). Nevertheless it's safer to ensure that no huge pmd can be left around if the vma is adjusted in a way that can't fit hugepages anymore at the new vm_start/vm_end address. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 19 ++++++++++++ mm/huge_memory.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-- mm/mmap.c | 2 ++ 3 files changed, 99 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c590b08c6fa6..827595228734 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,6 +104,19 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #error "hugepages can't be allocated by the buddy allocator" #endif extern int hugepage_madvise(unsigned long *vm_flags); +extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next); +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + return; + __vma_adjust_trans_huge(vma, start, end, adjust_next); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -125,6 +138,12 @@ static inline int hugepage_madvise(unsigned long *vm_flags) BUG(); return 0; } +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 30c3cec82023..b6facc35e893 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1075,8 +1075,16 @@ pmd_t *page_check_address_pmd(struct page *page, goto out; if (pmd_page(*pmd) != page) goto out; - VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && - pmd_trans_splitting(*pmd)); + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. + */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); @@ -2196,3 +2204,71 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) put_page(page); BUG_ON(pmd_trans_huge(*pmd)); } + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/mmap.c b/mm/mmap.c index 753f44d17047..73cc648873d6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -589,6 +589,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock. This is a fairly rare case by itself, but the anon_vma -- cgit v1.2.3-71-gd317 From 8ee53820edfd1f3b6554c593f337148dd3d7fc91 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: mmu_notifier_test_young For GRU and EPT, we need gup-fast to set referenced bit too (this is why it's correct to return 0 when shadow_access_mask is zero, it requires gup-fast to set the referenced bit). qemu-kvm access already sets the young bit in the pte if it isn't zero-copy, if it's zero copy or a shadow paging EPT minor fault we relay on gup-fast to signal the page is in use... We also need to check the young bits on the secondary pagetables for NPT and not nested shadow mmu as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is full backwards compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page completely isn't so bad either probably but I thought it was worth it and this makes it reliable. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/gup.c | 3 +++ include/linux/mmu_notifier.h | 26 ++++++++++++++++++++++++++ mm/huge_memory.c | 6 ++++-- mm/mmu_notifier.c | 20 ++++++++++++++++++++ virt/kvm/kvm_main.c | 17 +++++++++++++++++ 7 files changed, 105 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa75f21a9fba..ffd7f8d29187 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -822,6 +822,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 47b2c3288b6b..f02b8edc3d44 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -945,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return young; } +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * If there's no access bit in the secondary pte set by the + * hardware it's up to gup-fast/gup to set the access bit in + * the primary pte or in the page structure. + */ + if (!shadow_accessed_mask) + goto out; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + young = _spte & PT_ACCESSED_MASK; + if (young) { + young = 1; + break; + } + spte = rmap_next(kvm, rmapp, spte); + } +out: + return young; +} + #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -965,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 269aa53932e0..dbe34b931374 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) VM_BUG_ON(page != compound_head(page)); VM_BUG_ON(page_count(page) == 0); atomic_add(nr, &page->_count); + SetPageReferenced(page); } static inline void get_huge_page_tail(struct page *page) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cbfab1e9957d..cc2e7dfea9d7 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -61,6 +61,16 @@ struct mmu_notifier_ops { struct mm_struct *mm, unsigned long address); + /* + * test_young is called to check the young/accessed bitflag in + * the secondary pte. This is used to know if the page is + * frequently used without actually clearing the flag or tearing + * down the secondary mapping on the page. + */ + int (*test_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. @@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_young(mm, address); + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -313,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 915809b16edf..39d7df40c067 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1632,7 +1632,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON(PageLRU(page)); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (unlikely(!referenced)) @@ -1892,7 +1893,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* cannot use mapcount: can't collapse if there's a gup pin */ if (page_count(page) != 1) goto out_unmap; - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (referenced) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f2..8d032de4088e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 85ab7db0d366..4286d4766510 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -380,6 +380,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -396,6 +412,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; -- cgit v1.2.3-71-gd317 From 5a03b051ed87e72b959f32a86054e1142ac4cf55 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:11 -0800 Subject: thp: use compaction in kswapd for GFP_ATOMIC order > 0 This takes advantage of memory compaction to properly generate pages of order > 0 if regular page reclaim fails and priority level becomes more severe and we don't reach the proper watermarks. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 11 ++++++++--- mm/compaction.c | 31 ++++++++++++++++++++++++++----- mm/vmscan.c | 30 ++++++++++++++++++++---------- 3 files changed, 54 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 72cba4034785..dfa2ed4c0d26 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,6 +11,9 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +#define COMPACT_MODE_DIRECT_RECLAIM 0 +#define COMPACT_MODE_KSWAPD 1 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -25,7 +28,8 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist, bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync); + gfp_t gfp_mask, bool sync, + int compact_mode); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -70,9 +74,10 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync) + gfp_t gfp_mask, bool sync, + int compact_mode) { - return 0; + return COMPACT_CONTINUE; } static inline void defer_compaction(struct zone *zone) diff --git a/mm/compaction.c b/mm/compaction.c index da25b5a2e476..60d52a7298d5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,8 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -382,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -395,12 +397,27 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + if (cc->compact_mode != COMPACT_MODE_KSWAPD) + watermark = low_wmark_pages(zone); + else + watermark = high_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; if (cc->order == -1) return COMPACT_CONTINUE; + /* + * Generating only one page of the right order is not enough + * for kswapd, we must continue until we're above the high + * watermark as a pool for high order GFP_ATOMIC allocations + * too. + */ + if (cc->compact_mode == COMPACT_MODE_KSWAPD) + return COMPACT_CONTINUE; + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -514,8 +531,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync) + int order, gfp_t gfp_mask, + bool sync, + int compact_mode) { struct compact_control cc = { .nr_freepages = 0, @@ -524,6 +542,7 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -569,7 +588,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + COMPACT_MODE_DIRECT_RECLAIM); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -600,6 +620,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/vmscan.c b/mm/vmscan.c index cfdef0bcc7ab..f5b762ae23a2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -2382,6 +2383,7 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { + int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2411,9 +2413,26 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + + compaction = 0; + if (order && + zone_watermark_ok(zone, 0, + high_wmark_pages(zone), + end_zone, 0) && + !zone_watermark_ok(zone, order, + high_wmark_pages(zone), + end_zone, 0)) { + compact_zone_order(zone, + order, + sc.gfp_mask, false, + COMPACT_MODE_KSWAPD); + compaction = 1; + } + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (!compaction && nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2424,15 +2443,6 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - /* - * Compact the zone for higher orders to reduce - * latencies for higher-order allocations that - * would ordinarily call try_to_compact_pages() - */ - if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask, - false); - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit v1.2.3-71-gd317 From 2c888cfbc1b45508a44763d85ba2e8ac43faff5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: fix anon memory statistics with transparent hugepages Count each transparent hugepage as HPAGE_PMD_NR pages in the LRU statistics, so the Active(anon) and Inactive(anon) statistics in /proc/meminfo are correct. Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++++++++ include/linux/mm_inline.h | 8 +++++--- mm/huge_memory.c | 10 ++++++++++ mm/memcontrol.c | 2 +- mm/vmscan.c | 11 ++++++----- 5 files changed, 30 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 827595228734..9b48c24df260 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,11 +117,19 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, return; __vma_adjust_trans_huge(vma, start, end, adjust_next); } +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUG(); 0; }) +#define hpage_nr_pages(x) 1 + #define transparent_hugepage_enabled(__vma) 0 #define transparent_hugepage_flags 0UL diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 650f31eabdb1..8f7d24712dc1 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test @@ -24,7 +26,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, struct list_head *head) { list_add(&page->lru, head); - __inc_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -38,7 +40,7 @@ static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -73,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 892d8a17a7e5..f4f6041176a4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1143,6 +1143,7 @@ static void __split_huge_page_refcount(struct page *page) int i; unsigned long head_index = page->index; struct zone *zone = page_zone(page); + int zonestat; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1207,6 +1208,15 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. + */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1bb59d4c9d0..f4ea3410fb4d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1091,7 +1091,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f5b762ae23a2..0882014d2ce5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1103,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1158,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1483,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); -- cgit v1.2.3-71-gd317 From 37c2ac7872a9387542616f658d20ac25f5bdb32e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: compound_trans_order Read compound_trans_order safe. Noop for CONFIG_TRANSPARENT_HUGEPAGE=n. Signed-off-by: Andrea Arcangeli Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++++++ mm/memcontrol.c | 12 ++++++------ mm/memory-failure.c | 12 ++++++------ 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2695beab86..ce97a2bb0b19 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -450,6 +450,20 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f4ea3410fb4d..741206ffdace 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,10 +1027,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; - int page_size = PAGE_SIZE; - - if (PageTransHuge(page)) - page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -2286,8 +2282,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, int ret; int page_size = PAGE_SIZE; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2558,8 +2556,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } count = page_size >> PAGE_SHIFT; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1b43d0ffff65..548fbd70f026 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. @@ -930,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -938,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -968,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -1166,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1304,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ -- cgit v1.2.3-71-gd317 From a664b2d8555c659127bf8fe049a58449d394a707 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:17 -0800 Subject: thp: madvise(MADV_NOHUGEPAGE) Add madvise MADV_NOHUGEPAGE to mark regions that are not important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 14 ++++++++------ include/linux/khugepaged.h | 7 ++++--- include/linux/mm.h | 1 + mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++----------- mm/madvise.c | 4 +++- 5 files changed, 46 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9b48c24df260..a8b7e42d19ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -52,10 +52,12 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_SIZE HPAGE_SIZE #define transparent_hugepage_enabled(__vma) \ - (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE))) && \ + !((__vma)->vm_flags & VM_NOHUGEPAGE)) #define transparent_hugepage_defrag(__vma) \ ((transparent_hugepage_flags & \ (1< MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags); +extern int hugepage_madvise(unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -141,7 +143,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags) +static inline int hugepage_madvise(unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 552f3184756c..6b394f0b5148 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -38,9 +38,10 @@ static inline void khugepaged_exit(struct mm_struct *mm) static inline int khugepaged_enter(struct vm_area_struct *vma) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) - if (khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) + if ((khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) && + !(vma->vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index ce97a2bb0b19..956a35532f47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -83,6 +83,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSUP 0x00000200 #else #define VM_GROWSUP 0x00000000 +#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ #endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4f6041176a4..fce667c0281d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1388,18 +1389,36 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags) +int hugepage_madvise(unsigned long *vm_flags, int advice) { - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) - return -EINVAL; - - *vm_flags |= VM_HUGEPAGE; + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + break; + } return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index ecde40a401c1..bbac126e03ed 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -72,7 +72,8 @@ static long madvise_behavior(struct vm_area_struct * vma, goto out; break; case MADV_HUGEPAGE: - error = hugepage_madvise(&new_flags); + case MADV_NOHUGEPAGE: + error = hugepage_madvise(&new_flags, behavior); if (error) goto out; break; @@ -290,6 +291,7 @@ madvise_behavior_valid(int behavior) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: #endif return 1; -- cgit v1.2.3-71-gd317 From 60ab3244ec85c44276c585a2a20d3750402e1cf4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:18 -0800 Subject: thp: khugepaged: make khugepaged aware about madvise MADV_HUGEPAGE and MADV_NOHUGEPAGE were fully effective only if run after mmap and before touching the memory. While this is enough for most usages, it's little effort to make madvise more dynamic at runtime on an existing mapping by making khugepaged aware about madvise. MADV_HUGEPAGE: register in khugepaged immediately without waiting a page fault (that may not ever happen if all pages are already mapped and the "enabled" knob was set to madvise during the initial page faults). MADV_NOHUGEPAGE: skip vmas marked VM_NOHUGEPAGE in khugepaged to stop collapsing pages where not needed. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Andrea Arcangeli Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++-- mm/huge_memory.c | 23 +++++++++++++++++++---- mm/madvise.c | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8b7e42d19ec..bddfba1d7b85 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -105,7 +105,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags, int advice); +extern int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -143,7 +144,8 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags, int advice) +static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fce667c0281d..004c9c2aac78 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1389,7 +1389,8 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags, int advice) +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -1404,6 +1405,13 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; break; case MADV_NOHUGEPAGE: /* @@ -1417,6 +1425,11 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. + */ break; } @@ -1784,7 +1797,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) goto out; /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ @@ -2007,8 +2021,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, break; } - if (!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) { + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { progress++; continue; } diff --git a/mm/madvise.c b/mm/madvise.c index bbac126e03ed..2221491ed503 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -73,7 +73,7 @@ static long madvise_behavior(struct vm_area_struct * vma, break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: - error = hugepage_madvise(&new_flags, behavior); + error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; -- cgit v1.2.3-71-gd317 From 22e5c47ee238abe636655c3862ed28d6eb084ad4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:20 -0800 Subject: thp: add compound_trans_head() helper Cleanup some code with common compound_trans_head helper. Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Johannes Weiner Cc: Marcelo Tosatti Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 18 ++++++++++++++++++ mm/ksm.c | 15 +++------------ virt/kvm/kvm_main.c | 38 ++++++++++++++------------------------ 3 files changed, 35 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bddfba1d7b85..8e6c8c42bc3c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -126,6 +126,23 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } +static inline struct page *compound_trans_head(struct page *page) +{ + if (PageTail(page)) { + struct page *head; + head = page->first_page; + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail before + * overwriting first_page, so if PageTail is still + * there it means the head pointer isn't dangling. + */ + if (PageTail(page)) + return head; + } + return page; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -144,6 +161,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +#define compound_trans_head(page) compound_head(page) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { diff --git a/mm/ksm.c b/mm/ksm.c index 4d5a681923bb..33781de0b6bf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -415,20 +415,11 @@ out: static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head; - head = compound_head(page); + struct page *head = compound_trans_head(page); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * head may actually be splitted and freed from under + * us but it's ok here. */ - if (head != page) { - smp_rmb(); - if (!PageTransCompound(page)) - return NULL; - } if (PageAnon(head)) return head; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4286d4766510..f29abeb6a912 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,34 +104,24 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *head; + int reserved; struct page *tail = pfn_to_page(pfn); - head = compound_head(tail); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); if (head != tail) { - smp_rmb(); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. */ - if (PageTail(tail)) { - /* - * the "head" is not a dangling - * pointer but the hugepage may have - * been splitted from under us (and we - * may not hold a reference count on - * the head page so it can be reused - * before we run PageReferenced), so - * we've to recheck PageTail before - * returning what we just read. - */ - int reserved = PageReserved(head); - smp_rmb(); - if (PageTail(tail)) - return reserved; - } + smp_rmb(); + if (PageTail(tail)) + return reserved; } return PageReserved(tail); } -- cgit v1.2.3-71-gd317 From 29c1f677d424e8c5683a837fc4f03fc9f19201d7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:47:21 -0800 Subject: mm: migration: use rcu_dereference_protected when dereferencing the radix tree slot during file page migration migrate_pages() -> unmap_and_move() only calls rcu_read_lock() for anonymous pages, as introduced by git commit 989f89c57e6361e7d16fbd9572b5da7d313b073d ("fix rcu_read_lock() in page migraton"). The point of the RCU protection there is part of getting a stable reference to anon_vma and is only held for anon pages as file pages are locked which is sufficient protection against freeing. However, while a file page's mapping is being migrated, the radix tree is double checked to ensure it is the expected page. This uses radix_tree_deref_slot() -> rcu_dereference() without the RCU lock held triggering the following warning. [ 173.674290] =================================================== [ 173.676016] [ INFO: suspicious rcu_dereference_check() usage. ] [ 173.676016] --------------------------------------------------- [ 173.676016] include/linux/radix-tree.h:145 invoked rcu_dereference_check() without protection! [ 173.676016] [ 173.676016] other info that might help us debug this: [ 173.676016] [ 173.676016] [ 173.676016] rcu_scheduler_active = 1, debug_locks = 0 [ 173.676016] 1 lock held by hugeadm/2899: [ 173.676016] #0: (&(&inode->i_data.tree_lock)->rlock){..-.-.}, at: [] migrate_page_move_mapping+0x40/0x1ab [ 173.676016] [ 173.676016] stack backtrace: [ 173.676016] Pid: 2899, comm: hugeadm Not tainted 2.6.37-rc5-autobuild [ 173.676016] Call Trace: [ 173.676016] [] ? printk+0x14/0x1b [ 173.676016] [] lockdep_rcu_dereference+0x7d/0x86 [ 173.676016] [] migrate_page_move_mapping+0xca/0x1ab [ 173.676016] [] migrate_page+0x23/0x39 [ 173.676016] [] buffer_migrate_page+0x22/0x107 [ 173.676016] [] ? buffer_migrate_page+0x0/0x107 [ 173.676016] [] move_to_new_page+0x9a/0x1ae [ 173.676016] [] migrate_pages+0x1e7/0x2fa This patch introduces radix_tree_deref_slot_protected() which calls rcu_dereference_protected(). Users of it must pass in the mapping->tree_lock that is protecting this dereference. Holding the tree lock protects against parallel updaters of the radix tree meaning that rcu_dereference_protected is allowable. [akpm@linux-foundation.org: remove unneeded casts] Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Milton Miller Cc: Nick Piggin Cc: Wu Fengguang Cc: [2.6.37.early] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ mm/migrate.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ab2baa5c4884..23241c2fecce 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -145,6 +145,22 @@ static inline void *radix_tree_deref_slot(void **pslot) return rcu_dereference(*pslot); } +/** + * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held + * @pslot: pointer to slot, returned by radix_tree_lookup_slot + * Returns: item that was stored in that slot with any direct pointer flag + * removed. + * + * Similar to radix_tree_deref_slot but only used during migration when a pages + * mapping is being moved. The caller does not hold the RCU read lock but it + * must hold the tree lock to prevent parallel updates. + */ +static inline void *radix_tree_deref_slot_protected(void **pslot, + spinlock_t *treelock) +{ + return rcu_dereference_protected(*pslot, lockdep_is_held(treelock)); +} + /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot diff --git a/mm/migrate.c b/mm/migrate.c index 1a531b760b3b..89a6bc8cd307 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -320,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } -- cgit v1.2.3-71-gd317 From db16d5ec1f87f17511599bc77857dd1662b5a22f Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:35 -0800 Subject: memcg: add page_cgroup flags for dirty page tracking This patchset provides the ability for each cgroup to have independent dirty page limits. Limiting dirty memory is like fixing the max amount of dirty (hard to reclaim) page cache used by a cgroup. So, in case of multiple cgroup writers, they will not be able to consume more than their designated share of dirty pages and will be forced to perform write-out if they cross that limit. The patches are based on a series proposed by Andrea Righi in Mar 2010. Overview: - Add page_cgroup flags to record when pages are dirty, in writeback, or nfs unstable. - Extend mem_cgroup to record the total number of pages in each of the interesting dirty states (dirty, writeback, unstable_nfs). - Add dirty parameters similar to the system-wide /proc/sys/vm/dirty_* limits to mem_cgroup. The mem_cgroup dirty parameters are accessible via cgroupfs control files. - Consider both system and per-memcg dirty limits in page writeback when deciding to queue background writeback or block for foreground writeback. Known shortcomings: - When a cgroup dirty limit is exceeded, then bdi writeback is employed to writeback dirty inodes. Bdi writeback considers inodes from any cgroup, not just inodes contributing dirty pages to the cgroup exceeding its limit. - When memory.use_hierarchy is set, then dirty limits are disabled. This is a implementation detail. An enhanced implementation is needed to check the chain of parents to ensure that no dirty limit is exceeded. Performance data: - A page fault microbenchmark workload was used to measure performance, which can be called in read or write mode: f = open(foo. $cpu) truncate(f, 4096) alarm(60) while (1) { p = mmap(f, 4096) if (write) *p = 1 else x = *p munmap(p) } - The workload was called for several points in the patch series in different modes: - s_read is a single threaded reader - s_write is a single threaded writer - p_read is a 16 thread reader, each operating on a different file - p_write is a 16 thread writer, each operating on a different file - Measurements were collected on a 16 core non-numa system using "perf stat --repeat 3". The -a option was used for parallel (p_*) runs. - All numbers are page fault rate (M/sec). Higher is better. - To compare the performance of a kernel without non-memcg compare the first and last rows, neither has memcg configured. The first row does not include any of these memcg patches. - To compare the performance of using memcg dirty limits, compare the baseline (2nd row titled "w/ memcg") with the the code and memcg enabled (2nd to last row titled "all patches"). root_cgroup child_cgroup s_read s_write p_read p_write s_read s_write p_read p_write mmotm w/o memcg 0.428 0.390 0.429 0.388 mmotm w/ memcg 0.411 0.378 0.391 0.362 0.412 0.377 0.385 0.363 all patches 0.384 0.360 0.370 0.348 0.381 0.363 0.368 0.347 all patches 0.431 0.402 0.427 0.395 w/o memcg This patch: Add additional flags to page_cgroup to track dirty pages within a mem_cgroup. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrea Righi Signed-off-by: Greg Thelen Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b02195dfc1b0..fdb5a92b5ac7 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -40,6 +40,9 @@ enum { PCG_USED, /* this object is in use. */ PCG_ACCT_LRU, /* page has been accounted for */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ + PCG_FILE_DIRTY, /* page is dirty */ + PCG_FILE_WRITEBACK, /* page is under writeback */ + PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ PCG_MIGRATION, /* under page migration */ }; @@ -59,6 +62,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ { return test_and_clear_bit(PCG_##lname, &pc->flags); } +#define TESTSETPCGFLAG(uname, lname) \ +static inline int TestSetPageCgroup##uname(struct page_cgroup *pc) \ + { return test_and_set_bit(PCG_##lname, &pc->flags); } + /* Cache flag is set only once (at allocation) */ TESTPCGFLAG(Cache, CACHE) CLEARPCGFLAG(Cache, CACHE) @@ -78,6 +85,22 @@ SETPCGFLAG(FileMapped, FILE_MAPPED) CLEARPCGFLAG(FileMapped, FILE_MAPPED) TESTPCGFLAG(FileMapped, FILE_MAPPED) +SETPCGFLAG(FileDirty, FILE_DIRTY) +CLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTPCGFLAG(FileDirty, FILE_DIRTY) +TESTCLEARPCGFLAG(FileDirty, FILE_DIRTY) +TESTSETPCGFLAG(FileDirty, FILE_DIRTY) + +SETPCGFLAG(FileWriteback, FILE_WRITEBACK) +CLEARPCGFLAG(FileWriteback, FILE_WRITEBACK) +TESTPCGFLAG(FileWriteback, FILE_WRITEBACK) + +SETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +CLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTCLEARPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) +TESTSETPCGFLAG(FileUnstableNFS, FILE_UNSTABLE_NFS) + SETPCGFLAG(Migration, MIGRATION) CLEARPCGFLAG(Migration, MIGRATION) TESTPCGFLAG(Migration, MIGRATION) -- cgit v1.2.3-71-gd317 From 2a7106f2cb0768d00fe8c1eb42a754a7d8518f08 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:37 -0800 Subject: memcg: create extensible page stat update routines Replace usage of the mem_cgroup_update_file_mapped() memcg statistic update routine with two new routines: * mem_cgroup_inc_page_stat() * mem_cgroup_dec_page_stat() As before, only the file_mapped statistic is managed. However, these more general interfaces allow for new statistics to be more easily added. New statistics are added with memcg dirty page accounting. Signed-off-by: Greg Thelen Signed-off-by: Andrea Righi Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 16 +++++++--------- mm/rmap.c | 4 ++-- 3 files changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a0762aeaf..067115ce6b3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. */ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -293,8 +313,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 741206ffdace..3d8a0c79dece 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1600,7 +1600,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1623,30 +1624,27 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) unlock_page_cgroup(pc); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. diff --git a/mm/rmap.c b/mm/rmap.c index c30f33854f97..f21f4a1d6a1c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -937,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -979,7 +979,7 @@ void page_remove_rmap(struct page *page) NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.2.3-71-gd317 From dbd4ea78f002df283c95d9774837041735fa1bf9 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 13 Jan 2011 15:47:38 -0800 Subject: memcg: add lock to synchronize page accounting and migration Introduce a new bit spin lock, PCG_MOVE_LOCK, to synchronize the page accounting and migration code. This reworks the locking scheme of _update_stat() and _move_account() by adding new lock bit PCG_MOVE_LOCK, which is always taken under IRQ disable. 1. If pages are being migrated from a memcg, then updates to that memcg page statistics are protected by grabbing PCG_MOVE_LOCK using move_lock_page_cgroup(). In an upcoming commit, memcg dirty page accounting will be updating memcg page accounting (specifically: num writeback pages) from IRQ context (softirq). Avoid a deadlocking nested spin lock attempt by disabling irq on the local processor when grabbing the PCG_MOVE_LOCK. 2. lock for update_page_stat is used only for avoiding race with move_account(). So, IRQ awareness of lock_page_cgroup() itself is not a problem. The problem is between mem_cgroup_update_page_stat() and mem_cgroup_move_account_page(). Trade-off: * Changing lock_page_cgroup() to always disable IRQ (or local_bh) has some impacts on performance and I think it's bad to disable IRQ when it's not necessary. * adding a new lock makes move_account() slower. Score is here. Performance Impact: moving a 8G anon process. Before: real 0m0.792s user 0m0.000s sys 0m0.780s After: real 0m0.854s user 0m0.000s sys 0m0.842s This score is bad but planned patches for optimization can reduce this impact. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Greg Thelen Reviewed-by: Minchan Kim Acked-by: Daisuke Nishimura Cc: Andrea Righi Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 9 +++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index fdb5a92b5ac7..5b0c971d7cae 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,15 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_FILE_DIRTY, /* page is dirty */ PCG_FILE_WRITEBACK, /* page is under writeback */ PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ - PCG_MIGRATION, /* under page migration */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -117,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -130,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. + */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d8a0c79dece..d888956a2cfc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1606,6 +1606,7 @@ void mem_cgroup_update_page_stat(struct page *page, struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1617,7 +1618,7 @@ void mem_cgroup_update_page_stat(struct page *page, /* pc->mem_cgroup is unstable ? */ if (unlikely(mem_cgroup_stealed(mem))) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) @@ -1640,7 +1641,7 @@ void mem_cgroup_update_page_stat(struct page *page, out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } @@ -2211,9 +2212,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; + unsigned long flags; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + move_lock_page_cgroup(pc, &flags); __mem_cgroup_move_account(pc, from, to, uncharge); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); -- cgit v1.2.3-71-gd317 From 50de1dd967d4ba3b8a90ebe7a4f5feca24191317 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:43 -0800 Subject: memcg: fix memory migration of shmem swapcache In the current implementation mem_cgroup_end_migration() decides whether the page migration has succeeded or not by checking "oldpage->mapping". But if we are tring to migrate a shmem swapcache, the page->mapping of it is NULL from the begining, so the check would be invalid. As a result, mem_cgroup_end_migration() assumes the migration has succeeded even if it's not, so "newpage" would be freed while it's not uncharged. This patch fixes it by passing mem_cgroup_end_migration() the result of the page migration. Signed-off-by: Daisuke Nishimura Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Minchan Kim Reviewed-by: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- mm/memcontrol.c | 5 ++--- mm/migrate.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 067115ce6b3e..6a576f989437 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim. @@ -251,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6424ba0fce83..8ab841031436 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2896,7 +2896,7 @@ int mem_cgroup_prepare_migration(struct page *page, /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2905,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 5b7d1fd29621..46fe8cc13d67 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -768,7 +768,7 @@ skip_unmap: uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); -- cgit v1.2.3-71-gd317 From bba958783b1b4cb0a9420f4e11082467132a334c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 14 Jan 2011 15:57:47 +0900 Subject: mmc: sh_mmcif: Convert to __raw_xxx() I/O accessors. When using the I/O accessors in raw mode from the boot stubs we don't want to bother with any of the complexity associated with readl/writel and friends. Furthermore, utilization within the context of the host driver itself is all performed on an ioremapped window, so using the __raw variants there doesn't pose any problem either. If and when barriers need to be added in the future, these will need to be explicitly written out, but this is so far not a concern for any of the affected CPUs in question. This fixes up the link error introduced by the ARM tree via its barrier refactoring: arch/arm/boot/compressed/mmcif-sh7372.o: In function `mmcif_loader': mmcif-sh7372.c:(.text+0x9e8): undefined reference to `outer_cache Following the change in: http://www.arm.linux.org.uk/developer/patches/viewpatch.php?id=6275/1 Reported-by: Simon Horman Signed-off-by: Paul Mundt --- include/linux/mmc/sh_mmcif.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h index bf173502d744..38d393092812 100644 --- a/include/linux/mmc/sh_mmcif.h +++ b/include/linux/mmc/sh_mmcif.h @@ -94,12 +94,12 @@ struct sh_mmcif_plat_data { static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) { - return readl(addr + reg); + return __raw_readl(addr + reg); } static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) { - writel(val, addr + reg); + __raw_writel(val, addr + reg); } #define SH_MMCIF_BBS 512 /* boot block size */ -- cgit v1.2.3-71-gd317