From 2f9f28b212a2bd4948c8ceaaec33ce0123632129 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 4 Apr 2011 15:19:25 +0200 Subject: netfilter: ipset: references are protected by rwlock instead of mutex The timeout variant of the list:set type must reference the member sets. However, its garbage collector runs in timer-interrupt context, so the references cannot be protected by a mutex. Therefore the reference protection is converted to an rwlock. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- include/linux/netfilter/ipset/ip_set.h | 2 +- include/linux/netfilter/ipset/ip_set_ahash.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ec333d83f3b4..5a262e3ae715 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -293,7 +293,7 @@ struct ip_set { /* Lock protecting the set data */ rwlock_t lock; /* References to the set */ - atomic_t ref; + u32 ref; /* The core set type */ struct ip_set_type *type; /* The type variant doing the real job */ diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h index ec9d9bea1e37..a0196ac79051 100644 --- a/include/linux/netfilter/ipset/ip_set_ahash.h +++ b/include/linux/netfilter/ipset/ip_set_ahash.h @@ -515,8 +515,7 @@ type_pf_head(struct ip_set *set, struct sk_buff *skb) if (h->netmask != HOST_MASK) NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, h->netmask); #endif - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, - htonl(atomic_read(&set->ref) - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)); if (with_timeout(h->timeout)) NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout)); -- cgit v1.2.3-71-gd317 From 31ad3dd64e689bc79dd819f8f134b9b025240eb8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Apr 2011 16:56:29 +0200 Subject: netfilter: af_info: add network namespace parameter to route hook This is required to eventually replace the rt6_lookup call in xt_addrtype.c with nf_afinfo->route(). Signed-off-by: Florian Westphal Acked-by: David S.
Miller Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 3 ++- net/ipv4/netfilter.c | 5 +++-- net/ipv6/netfilter.c | 5 +++-- net/netfilter/nf_conntrack_h323_main.c | 8 ++++---- net/netfilter/xt_TCPMSS.c | 2 +- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index eeec00abb664..20ed4528e850 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -270,7 +270,8 @@ struct nf_afinfo { unsigned int dataoff, unsigned int len, u_int8_t protocol); - int (*route)(struct dst_entry **dst, struct flowi *fl); + int (*route)(struct net *net, struct dst_entry **dst, + struct flowi *fl); void (*saveroute)(const struct sk_buff *skb, struct nf_queue_entry *entry); int (*reroute)(struct sk_buff *skb, diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index f3c0b549b8e1..f1035f056503 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -221,9 +221,10 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; } -static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip_route(struct net *net, struct dst_entry **dst, + struct flowi *fl) { - struct rtable *rt = ip_route_output_key(&init_net, &fl->u.ip4); + struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) return PTR_ERR(rt); *dst = &rt->dst; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 39aaca2b4fd2..e008b9b4a779 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -90,9 +90,10 @@ static int nf_ip6_reroute(struct sk_buff *skb, return 0; } -static int nf_ip6_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip6_route(struct net *net, struct dst_entry **dst, + struct flowi *fl) { - *dst = ip6_route_output(&init_net, NULL, &fl->u.ip6); + *dst = ip6_route_output(net, NULL, &fl->u.ip6); return (*dst)->error; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 533a183e6661..39a453895b4d 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -731,9 +731,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); fl2.daddr = dst->ip; - if (!afinfo->route((struct dst_entry **)&rt1, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, flowi4_to_flowi(&fl1))) { - if (!afinfo->route((struct dst_entry **)&rt2, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, flowi4_to_flowi(&fl2))) { if (rt1->rt_gateway == rt2->rt_gateway && rt1->dst.dev == rt2->dst.dev) @@ -755,9 +755,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); ipv6_addr_copy(&fl2.daddr, &dst->in6); - if (!afinfo->route((struct dst_entry **)&rt1, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, flowi6_to_flowi(&fl1))) { - if (!afinfo->route((struct dst_entry **)&rt2, + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2))) { if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, sizeof(rt1->rt6i_gateway)) && diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 6e6b46cb1db9..8690125e3b18 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -166,7 +166,7 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, rcu_read_lock(); ai = nf_get_afinfo(family); if (ai != NULL) - ai->route((struct dst_entry **)&rt, &fl); + ai->route(&init_net, (struct dst_entry **)&rt, &fl); 
rcu_read_unlock(); if (rt != NULL) { -- cgit v1.2.3-71-gd317 From 0fae2e7740aca7e384c5f337f458897e7e337d58 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Apr 2011 17:00:54 +0200 Subject: netfilter: af_info: add 'strict' parameter to limit lookup to .oif The ipv6 fib lookup can set the RT6_LOOKUP_F_IFACE flag to restrict the search to an interface, but this flag cannot be set via struct flowi. Also, it cannot be set via ip6_route_output: this function uses the passed sock struct to determine if this flag is required (by testing for nonzero sk_bound_dev_if). Work around this by passing in an artificial struct sk when the 'strict' argument is true. This is required to replace the rt6_lookup call in xt_addrtype.c with nf_afinfo->route(). Signed-off-by: Florian Westphal Acked-by: David S. Miller Signed-off-by: Patrick McHardy --- include/linux/netfilter.h | 2 +- net/ipv4/netfilter.c | 2 +- net/ipv6/netfilter.c | 12 ++++++++++-- net/netfilter/nf_conntrack_h323_main.c | 8 ++++---- net/netfilter/xt_TCPMSS.c | 2 +- 5 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 20ed4528e850..7fa95df60146 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -271,7 +271,7 @@ struct nf_afinfo { unsigned int len, u_int8_t protocol); int (*route)(struct net *net, struct dst_entry **dst, - struct flowi *fl); + struct flowi *fl, bool strict); void (*saveroute)(const struct sk_buff *skb, struct nf_queue_entry *entry); int (*reroute)(struct sk_buff *skb, diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index f1035f056503..4614babdc45f 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -222,7 +222,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, } static int nf_ip_route(struct net *net, struct dst_entry **dst, - struct flowi *fl) + struct flowi *fl, bool strict __always_unused) { struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) return PTR_ERR(rt); *dst = &rt->dst; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index e008b9b4a779..28bc1f644b7b 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -91,9 +91,17 @@ static int nf_ip6_reroute(struct sk_buff *skb, } static int nf_ip6_route(struct net *net, struct dst_entry **dst, - struct flowi *fl) + struct flowi *fl, bool strict) { - *dst = ip6_route_output(net, NULL, &fl->u.ip6); + static const struct ipv6_pinfo fake_pinfo; + static const struct inet_sock fake_sk = { + /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */ + .sk.sk_bound_dev_if = 1, + .pinet6 = (struct ipv6_pinfo *) &fake_pinfo, + }; + const void *sk = strict ?
&fake_sk : NULL; + + *dst = ip6_route_output(net, sk, &fl->u.ip6); return (*dst)->error; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 39a453895b4d..18b2ce5c8ced 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -732,9 +732,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); fl2.daddr = dst->ip; if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, - flowi4_to_flowi(&fl1))) { + flowi4_to_flowi(&fl1), false)) { if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, - flowi4_to_flowi(&fl2))) { + flowi4_to_flowi(&fl2), false)) { if (rt1->rt_gateway == rt2->rt_gateway && rt1->dst.dev == rt2->dst.dev) ret = 1; @@ -756,9 +756,9 @@ static int callforward_do_filter(const union nf_inet_addr *src, memset(&fl2, 0, sizeof(fl2)); ipv6_addr_copy(&fl2.daddr, &dst->in6); if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, - flowi6_to_flowi(&fl1))) { + flowi6_to_flowi(&fl1), false)) { if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, - flowi6_to_flowi(&fl2))) { + flowi6_to_flowi(&fl2), false)) { if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, sizeof(rt1->rt6i_gateway)) && rt1->dst.dev == rt2->dst.dev) diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 8690125e3b18..9e63b43faeed 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -166,7 +166,7 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, rcu_read_lock(); ai = nf_get_afinfo(family); if (ai != NULL) - ai->route(&init_net, (struct dst_entry **)&rt, &fl); + ai->route(&init_net, (struct dst_entry **)&rt, &fl, false); rcu_read_unlock(); if (rt != NULL) { -- cgit v1.2.3-71-gd317 From 34206f267120c839a479d0237db907fa062e7b0f Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Tue, 5 Apr 2011 07:08:41 +0000 Subject: can: mcp251x: Allow passing IRQ flags through platform data. When an interrupt occurs, the INT pin is driven low by the MCP251x controller (falling edge), but in some cases the INT pin can be connected to the MPU through a transistor or level translator, which inverts this signal. In this case the interrupt should be configured for rising edge. This patch adds support for passing the IRQ flags via mcp251x_platform_data. Signed-off-by: Enric Balletbo i Serra Acked-by: Wolfgang Grandegger Acked-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- drivers/net/can/mcp251x.c | 3 ++- include/linux/can/platform/mcp251x.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c index 7513c4523ac4..330140ee266d 100644 --- a/drivers/net/can/mcp251x.c +++ b/drivers/net/can/mcp251x.c @@ -931,7 +931,8 @@ static int mcp251x_open(struct net_device *net) priv->tx_len = 0; ret = request_threaded_irq(spi->irq, NULL, mcp251x_can_ist, - IRQF_TRIGGER_FALLING, DEVICE_NAME, priv); + pdata->irq_flags ?
pdata->irq_flags : IRQF_TRIGGER_FALLING, + DEVICE_NAME, priv); if (ret) { dev_err(&spi->dev, "failed to acquire irq %d\n", spi->irq); if (pdata->transceiver_enable) diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h index 8e20540043f5..089fe43211a4 100644 --- a/include/linux/can/platform/mcp251x.h +++ b/include/linux/can/platform/mcp251x.h @@ -12,6 +12,7 @@ /** * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data * @oscillator_frequency: - oscillator frequency in Hz + * @irq_flags: - IRQF configuration flags * @board_specific_setup: - called before probing the chip (power,reset) * @transceiver_enable: - called to power on/off the transceiver * @power_enable: - called to power on/off the mcp *and* the @@ -24,6 +25,7 @@ struct mcp251x_platform_data { unsigned long oscillator_frequency; + unsigned long irq_flags; int (*board_specific_setup)(struct spi_device *spi); int (*transceiver_enable)(int enable); int (*power_enable) (int enable); -- cgit v1.2.3-71-gd317 From 1f112cee07b314e244ee9e71d9c1e6950dc13327 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 Apr 2011 22:54:42 +0200 Subject: PM / Hibernate: Introduce CONFIG_HIBERNATE_CALLBACKS Xen save/restore is going to use hibernate device callbacks for quiescing devices and putting them back to normal operations and it would need to select CONFIG_HIBERNATION for this purpose. However, that also would cause the hibernate interfaces for user space to be enabled, which might confuse user space, because the Xen kernels don't support hibernation. Moreover, it would be wasteful, as it would make the Xen kernels include a substantial amount of code that they would never use. To address this issue introduce new power management Kconfig option CONFIG_HIBERNATE_CALLBACKS, such that it will only select the code that is necessary for the hibernate device callbacks to work and make CONFIG_HIBERNATION select it. Then, Xen save/restore will be able to select CONFIG_HIBERNATE_CALLBACKS without dragging the entire hibernate code along with it. Signed-off-by: Rafael J. 
Wysocki Tested-by: Shriram Rajagopalan --- arch/powerpc/kernel/ibmebus.c | 6 +++--- drivers/amba/bus.c | 6 +++--- drivers/base/platform.c | 6 +++--- drivers/base/power/main.c | 8 ++++---- drivers/pci/pci-driver.c | 6 +++--- drivers/xen/manage.c | 6 +++--- include/linux/suspend.h | 11 +++-------- kernel/power/Kconfig | 6 +++++- 8 files changed, 27 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index c00d4ca1ee15..28581f1ad2c0 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -527,7 +527,7 @@ static int ibmebus_bus_pm_resume_noirq(struct device *dev) #endif /* !CONFIG_SUSPEND */ -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS static int ibmebus_bus_pm_freeze(struct device *dev) { @@ -665,7 +665,7 @@ static int ibmebus_bus_pm_restore_noirq(struct device *dev) return ret; } -#else /* !CONFIG_HIBERNATION */ +#else /* !CONFIG_HIBERNATE_CALLBACKS */ #define ibmebus_bus_pm_freeze NULL #define ibmebus_bus_pm_thaw NULL @@ -676,7 +676,7 @@ static int ibmebus_bus_pm_restore_noirq(struct device *dev) #define ibmebus_bus_pm_poweroff_noirq NULL #define ibmebus_bus_pm_restore_noirq NULL -#endif /* !CONFIG_HIBERNATION */ +#endif /* !CONFIG_HIBERNATE_CALLBACKS */ static struct dev_pm_ops ibmebus_bus_dev_pm_ops = { .prepare = ibmebus_bus_pm_prepare, diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 821040503154..7025593a58c8 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -214,7 +214,7 @@ static int amba_pm_resume_noirq(struct device *dev) #endif /* !CONFIG_SUSPEND */ -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS static int amba_pm_freeze(struct device *dev) { @@ -352,7 +352,7 @@ static int amba_pm_restore_noirq(struct device *dev) return ret; } -#else /* !CONFIG_HIBERNATION */ +#else /* !CONFIG_HIBERNATE_CALLBACKS */ #define amba_pm_freeze NULL #define amba_pm_thaw NULL @@ -363,7 +363,7 @@ static int amba_pm_restore_noirq(struct device *dev) #define amba_pm_poweroff_noirq NULL #define amba_pm_restore_noirq NULL -#endif /* !CONFIG_HIBERNATION */ +#endif /* !CONFIG_HIBERNATE_CALLBACKS */ #ifdef CONFIG_PM diff --git a/drivers/base/platform.c b/drivers/base/platform.c index f051cfff18af..92792313d24a 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -771,7 +771,7 @@ int __weak platform_pm_resume_noirq(struct device *dev) #endif /* !CONFIG_SUSPEND */ -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS static int platform_pm_freeze(struct device *dev) { @@ -909,7 +909,7 @@ static int platform_pm_restore_noirq(struct device *dev) return ret; } -#else /* !CONFIG_HIBERNATION */ +#else /* !CONFIG_HIBERNATE_CALLBACKS */ #define platform_pm_freeze NULL #define platform_pm_thaw NULL @@ -920,7 +920,7 @@ static int platform_pm_restore_noirq(struct device *dev) #define platform_pm_poweroff_noirq NULL #define platform_pm_restore_noirq NULL -#endif /* !CONFIG_HIBERNATION */ +#endif /* !CONFIG_HIBERNATE_CALLBACKS */ #ifdef CONFIG_PM_RUNTIME diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 052dc53eef38..fbc5b6e7c591 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -233,7 +233,7 @@ static int pm_op(struct device *dev, } break; #endif /* CONFIG_SUSPEND */ -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: if (ops->freeze) { @@ -260,7 +260,7 @@ static int pm_op(struct device *dev, suspend_report_result(ops->restore, error); } 
break; -#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_HIBERNATE_CALLBACKS */ default: error = -EINVAL; } @@ -308,7 +308,7 @@ static int pm_noirq_op(struct device *dev, } break; #endif /* CONFIG_SUSPEND */ -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: if (ops->freeze_noirq) { @@ -335,7 +335,7 @@ static int pm_noirq_op(struct device *dev, suspend_report_result(ops->restore_noirq, error); } break; -#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_HIBERNATE_CALLBACKS */ default: error = -EINVAL; } diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d86ea8b01137..135df164a4c1 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -781,7 +781,7 @@ static int pci_pm_resume(struct device *dev) #endif /* !CONFIG_SUSPEND */ -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS static int pci_pm_freeze(struct device *dev) { @@ -970,7 +970,7 @@ static int pci_pm_restore(struct device *dev) return error; } -#else /* !CONFIG_HIBERNATION */ +#else /* !CONFIG_HIBERNATE_CALLBACKS */ #define pci_pm_freeze NULL #define pci_pm_freeze_noirq NULL @@ -981,7 +981,7 @@ static int pci_pm_restore(struct device *dev) #define pci_pm_restore NULL #define pci_pm_restore_noirq NULL -#endif /* !CONFIG_HIBERNATION */ +#endif /* !CONFIG_HIBERNATE_CALLBACKS */ #ifdef CONFIG_PM_RUNTIME diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 95143dd6904d..1ac94125bf93 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -61,7 +61,7 @@ static void xen_post_suspend(int cancelled) xen_mm_unpin_all(); } -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS static int xen_suspend(void *data) { struct suspend_info *si = data; @@ -173,7 +173,7 @@ out: #endif shutting_down = SHUTDOWN_INVALID; } -#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_HIBERNATE_CALLBACKS */ struct shutdown_handler { const char *command; @@ -202,7 +202,7 @@ static void shutdown_handler(struct xenbus_watch *watch, { "poweroff", do_poweroff }, { "halt", do_poweroff }, { "reboot", do_reboot }, -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_HIBERNATE_CALLBACKS { "suspend", do_suspend }, #endif {NULL, NULL}, diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 5a89e3612875..083ffea7ba18 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -249,6 +249,8 @@ extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); #else /* CONFIG_HIBERNATION */ +static inline void register_nosave_region(unsigned long b, unsigned long e) {} +static inline void register_nosave_region_late(unsigned long b, unsigned long e) {} static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} @@ -297,14 +299,7 @@ static inline bool pm_wakeup_pending(void) { return false; } extern struct mutex pm_mutex; -#ifndef CONFIG_HIBERNATION -static inline void register_nosave_region(unsigned long b, unsigned long e) -{ -} -static inline void register_nosave_region_late(unsigned long b, unsigned long e) -{ -} - +#ifndef CONFIG_HIBERNATE_CALLBACKS static inline void lock_system_sleep(void) {} static inline void unlock_system_sleep(void) {} diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4603f08dc47b..049791468d37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -18,9 +18,13 @@ config 
SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATE_CALLBACKS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -85,7 +89,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + depends on SUSPEND || HIBERNATE_CALLBACKS || XEN_SAVE_RESTORE config PM_SLEEP_SMP def_bool y -- cgit v1.2.3-71-gd317 From e710d7d5a9cab1041b7a3cf9e655b75d92786857 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Fri, 8 Apr 2011 00:43:01 +0200 Subject: mfd: Fetch cell pointer from platform_device->mfd_cell In order for MFD drivers to fetch both their cell pointer and their platform data pointer, an mfd cell pointer is added to the platform_device structure. That allows all MFD sub-device drivers to be MFD agnostic, unless they really need to access their MFD cell data. Most of them don't, especially the ones for IPs used by both MFD and non-MFD SoCs. Cc: Grant Likely Acked-by: Greg KH Signed-off-by: Samuel Ortiz --- drivers/base/platform.c | 1 + drivers/mfd/mfd-core.c | 16 ++++++++++++++-- include/linux/mfd/core.h | 13 +++++++++++-- include/linux/platform_device.h | 5 +++++ 4 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index f051cfff18af..6c3a2bdc527a 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -149,6 +149,7 @@ static void platform_device_release(struct device *dev) of_device_node_put(&pa->pdev.dev); kfree(pa->pdev.dev.platform_data); + kfree(pa->pdev.mfd_cell); kfree(pa->pdev.resource); kfree(pa); } diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c index d01574d98870..f4c8c844b913 100644 --- a/drivers/mfd/mfd-core.c +++ b/drivers/mfd/mfd-core.c @@ -55,6 +55,19 @@ int mfd_cell_disable(struct platform_device *pdev) } EXPORT_SYMBOL(mfd_cell_disable); +static int mfd_platform_add_cell(struct platform_device *pdev, + const struct mfd_cell *cell) +{ + if (!cell) + return 0; + + pdev->mfd_cell = kmemdup(cell, sizeof(*cell), GFP_KERNEL); + if (!pdev->mfd_cell) + return -ENOMEM; + + return 0; +} + static int mfd_add_device(struct device *parent, int id, const struct mfd_cell *cell, struct resource *mem_base, @@ -75,7 +88,7 @@ static int mfd_add_device(struct device *parent, int id, pdev->dev.parent = parent; - ret = platform_device_add_data(pdev, cell, sizeof(*cell)); + ret = mfd_platform_add_cell(pdev, cell); if (ret) goto fail_res; @@ -123,7 +136,6 @@ static int mfd_add_device(struct device *parent, int id, return 0; -/* platform_device_del(pdev); */ fail_res: kfree(res); fail_device: diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index ad1b19aa6508..aef23309a742 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -86,16 +86,25 @@ extern int mfd_clone_cell(const char *cell, const char **clones, */ static inline const struct mfd_cell *mfd_get_cell(struct platform_device *pdev) { - return pdev->dev.platform_data; + return pdev->mfd_cell; } /* * Given a platform device that's been created by mfd_add_devices(), fetch * the .mfd_data entry from the mfd_cell that created it. + * Otherwise just return the platform_data pointer. + * This maintains compatibility with platform drivers whose devices aren't + * created by the mfd layer, and expect platform_data to contain what would've + * otherwise been in mfd_data.
*/ static inline void *mfd_get_data(struct platform_device *pdev) { - return mfd_get_cell(pdev)->mfd_data; + const struct mfd_cell *cell = mfd_get_cell(pdev); + + if (cell) + return cell->mfd_data; + else + return pdev->dev.platform_data; } extern int mfd_add_devices(struct device *parent, int id, diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index d96db9825708..744942c95fec 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -14,6 +14,8 @@ #include #include +struct mfd_cell; + struct platform_device { const char * name; int id; @@ -23,6 +25,9 @@ struct platform_device { const struct platform_device_id *id_entry; + /* MFD cell pointer */ + struct mfd_cell *mfd_cell; + /* arch specific additions */ struct pdev_archdata archdata; }; -- cgit v1.2.3-71-gd317 From be85bccaa5aa5a11dcaf85f9e945ffefd253f631 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Apr 2011 13:35:56 -0700 Subject: Revert "vfs: Export file system uuid via /proc//mountinfo" This reverts commit 93f1c20bc8cdb757be50566eff88d65c3b26881f. It turns out that libmount misparses it because it adds a '-' character in the uuid string, which libmount then incorrectly confuses with the separator string (" - ") at the end of all the optional arguments. Upstream libmount (in the util-linux tree) has been fixed, but until that fix actually percolates up to users, we'd better not expose this change in the kernel. Let's revisit this later (possibly by exposing the UUID without any '-' characters in it, avoiding the user-space bug). Reported-by: Dave Jones Cc: Aneesh Kumar K.V Cc: Al Viro Cc: Karel Zak Cc: Ram Pai Cc: Miklos Szeredi Cc: Eric Sandeen Signed-off-by: Linus Torvalds --- fs/namespace.c | 16 ---------------- include/linux/fs.h | 1 - 2 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 7dba2ed03429..d99bcf59e4c2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1030,18 +1030,6 @@ const struct seq_operations mounts_op = { .show = show_vfsmnt }; -static int uuid_is_nil(u8 *uuid) -{ - int i; - u8 *cp = (u8 *)uuid; - - for (i = 0; i < 16; i++) { - if (*cp++) - return 0; - } - return 1; -} - static int show_mountinfo(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; @@ -1085,10 +1073,6 @@ static int show_mountinfo(struct seq_file *m, void *v) if (IS_MNT_UNBINDABLE(mnt)) seq_puts(m, " unbindable"); - if (!uuid_is_nil(mnt->mnt_sb->s_uuid)) - /* print the uuid */ - seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid); - /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index dbd860af0804..5b14843af9f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1403,7 +1403,6 @@ struct super_block { wait_queue_head_t s_wait_unfrozen; char s_id[32]; /* Informational name */ - u8 s_uuid[16]; /* UUID */ void *s_fs_info; /* Filesystem private info */ fmode_t s_mode; -- cgit v1.2.3-71-gd317 From 0bba01695b74fdd2f9286243bb39f88544d81401 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Apr 2011 15:21:04 -0700 Subject: vfs: Re-introduce s_uuid in the superblock Gaah. When commit be85bccaa5aa reverted the export of file system uuid via /proc//mountinfo, it also unintentionally removed the s_uuid field in struct super_block. I didn't mean to do that, since filesystems have been taught to fill it in (and we want to keep it for future re-introduction in the mountinfo file). Stupid of me. This adds it back in. 
Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b14843af9f9..dbd860af0804 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1403,6 +1403,7 @@ struct super_block { wait_queue_head_t s_wait_unfrozen; char s_id[32]; /* Informational name */ + u8 s_uuid[16]; /* UUID */ void *s_fs_info; /* Filesystem private info */ fmode_t s_mode; -- cgit v1.2.3-71-gd317 From 184748cc50b2dceb8287f9fb657eda48ff8fcfe7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:39 +0200 Subject: sched: Provide scheduler_ipi() callback in response to smp_send_reschedule() For future rework of try_to_wake_up() we'd like to push part of that function onto the CPU the task is actually going to run on. In order to do so we need a generic callback from the existing scheduler IPI. This patch introduces such a generic callback: scheduler_ipi() and implements it as a NOP. BenH notes: PowerPC might use this IPI on offline CPUs under rare conditions! Acked-by: Russell King Acked-by: Martin Schwidefsky Acked-by: Chris Metcalf Acked-by: Jesper Nilsson Acked-by: Benjamin Herrenschmidt Signed-off-by: Ralf Baechle Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.744338123@chello.nl --- arch/alpha/kernel/smp.c | 3 +-- arch/arm/kernel/smp.c | 5 +---- arch/blackfin/mach-common/smp.c | 3 +++ arch/cris/arch-v32/kernel/smp.c | 13 ++++++++----- arch/ia64/kernel/irq_ia64.c | 2 ++ arch/ia64/xen/irq_xen.c | 10 +++++++++- arch/m32r/kernel/smp.c | 4 +--- arch/mips/cavium-octeon/smp.c | 2 ++ arch/mips/kernel/smtc.c | 2 +- arch/mips/mti-malta/malta-int.c | 2 ++ arch/mips/pmc-sierra/yosemite/smp.c | 4 ++++ arch/mips/sgi-ip27/ip27-irq.c | 2 ++ arch/mips/sibyte/bcm1480/smp.c | 7 +++---- arch/mips/sibyte/sb1250/smp.c | 7 +++---- arch/mn10300/kernel/smp.c | 5 +---- arch/parisc/kernel/smp.c | 5 +---- arch/powerpc/kernel/smp.c | 4 ++-- arch/s390/kernel/smp.c | 6 +++--- arch/sh/kernel/smp.c | 2 ++ arch/sparc/kernel/smp_32.c | 4 +++- arch/sparc/kernel/smp_64.c | 1 + arch/tile/kernel/smp.c | 6 +----- arch/um/kernel/smp.c | 2 +- arch/x86/kernel/smp.c | 5 ++--- arch/x86/xen/smp.c | 5 ++--- include/linux/sched.h | 2 ++ 26 files changed, 63 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078a5e4d..5a621c6d22ab 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. 
*/ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 8fe05ad932e4..7a561eb731ea 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 6e17a265c4d3..326bb86f4d29 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -164,6 +164,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) while (msg_queue->count) { msg = &msg_queue->ipi_message[msg_queue->head]; switch (msg->type) { + case BFIN_IPI_RESCHEDULE: + scheduler_ipi(); + break; case BFIN_IPI_CALL_FUNC: spin_unlock_irqrestore(&msg_queue->lock, flags); ipi_call_function(cpu, msg); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1ba5d1..66cc75657e2f 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); + if (ipi.vector & IPI_SCHEDULE) { + scheduler_ipi(); + } if (ipi.vector & IPI_CALL) { - func(info); + func(info); } if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) + if (flush_mm == FLUSH_ALL) + __flush_tlb_all(); + else if (flush_vma == FLUSH_ALL) __flush_tlb_mm(flush_mm); - else + else __flush_tlb_page(flush_vma, flush_addr); } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 5b704740f160..782c3a357f24 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) smp_local_flush_tlb(); kstat_incr_irqs_this_cpu(irq, desc); } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); kstat_incr_irqs_this_cpu(irq, desc); } else { ia64_setreg(_IA64_REG_CR_TPR, vector); diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb858acf2..b279e142c633 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c @@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; static int xen_slab_ready; #ifdef CONFIG_SMP +#include + /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, * it ends up to issue several memory accesses upon percpu data and * thus adds unnecessary traffic to other paths. @@ -99,7 +101,13 @@ static int xen_slab_ready; static irqreturn_t xen_dummy_handler(int irq, void *dev_id) { + return IRQ_HANDLED; +} +static irqreturn_t +xen_resched_handler(int irq, void *dev_id) +{ + scheduler_ipi(); return IRQ_HANDLED; } @@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { }; static struct irqaction xen_resched_irqaction = { - .handler = xen_dummy_handler, + .handler = xen_resched_handler, .flags = IRQF_DISABLED, .name = "resched" }; diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20b2996..fc10b39893d4 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) * * Description: This routine executes on CPU which received * 'RESCHEDULE_IPI'. 
- * Rescheduling is processed at the exit of interrupt - * operation. * * Born on Date: 2002.02.05 * @@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) *==========================================================================*/ void smp_reschedule_interrupt(void) { - /* nothing to do */ + scheduler_ipi(); } /*==========================================================================* diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21cc8d0..76923eeb58b9 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); /* Check if we've been told to flush the icache */ if (action & SMP_ICACHE_FLUSH) diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4ccd5a..cedac4633741 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) static void ipi_resched_interrupt(void) { - /* Return from interrupt should be enough to cause scheduler check */ + scheduler_ipi(); } static void ipi_call_interrupt(void) diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index 9027061f0ead..7d93e6fbfa5a 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -309,6 +309,8 @@ static void ipi_call_dispatch(void) static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) { + scheduler_ipi(); + return IRQ_HANDLED; } diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e889b349..2608752898c0 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -55,6 +55,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; case 1: @@ -63,6 +65,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; } } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603d577c..b18b04e48577 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) #ifdef CONFIG_SMP if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c992ea..d667875be564 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb1128d..38e7f6bd7922 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ 
b/arch/mips/sibyte/sb1250/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826a2194..83fb27912231 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -494,14 +494,11 @@ void smp_send_stop(void) * @irq: The interrupt number. * @dev_id: The device ID. * - * We need do nothing here, since the scheduling will be effected on our way - * back through entry.S. - * * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. */ static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { - /* do nothing */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d354ef0..828305f19cff 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); - /* - * Reschedule callback. Everything to be - * done is done by the interrupt return path. - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbdbb14be4b0..9f9c204bef69 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -116,7 +116,7 @@ void smp_message_recv(int msg) generic_smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: - /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data) static irqreturn_t reschedule_action(int irq, void *data) { - /* we just need the return path side effect of checking need_resched */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 63a97db83f96..63c7d9ff220d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; /* * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. 
*/ bits = xchg(&S390_lowcore.ext_call_fast, 0); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b45115..6207561ea34a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) generic_smp_call_function_interrupt(); break; case SMP_MSG_RESCHEDULE: + scheduler_ipi(); break; case SMP_MSG_FUNCTION_SINGLE: generic_smp_call_function_single_interrupt(); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 91c10fb70858..f95690c167b6 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -125,7 +125,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; void smp_send_reschedule(int cpu) { - /* See sparc64 */ + /* + * XXX missing reschedule IPI, see scheduler_ipi() + */ } void smp_send_stop(void) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3e94a8c23238..9478da7fdb3e 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + scheduler_ipi(); } /* This is a nop because we capture all other cpus diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a4293102ef81..c52224d5ed45 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ static irqreturn_t handle_reschedule_ipi(int irq, void *token) { - /* - * Nothing to do here; when we return from interrupt, the - * rescheduling will occur there. But do bump the interrupt - * profiler count in the meantime. - */ __get_cpu_var(irq_stat).irq_resched_count++; + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27e2a9a..eefb107d2d73 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -173,7 +173,7 @@ void IPI_handler(int cpu) break; case 'R': - set_tsk_need_resched(current); + scheduler_ipi(); break; case 'S': diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228d..013e7eba83bb 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 30612441ed99..762b46ab14d5 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. 
*/ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { inc_irq_stat(irq_resched_count); + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c027e92c..758e27afcda5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2189,8 +2189,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +static inline void scheduler_ipi(void) { } extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { -- cgit v1.2.3-71-gd317 From 3ca7a440da394808571dad32d33d3bc0389982e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:40 +0200 Subject: sched: Always provide p->on_cpu Always provide p->on_cpu so that we can determine if its on a cpu without having to lock the rq. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152728.785452014@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +--- kernel/sched.c | 46 +++++++++++++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 758e27afcda5..3435837e89ff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1200,9 +1200,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - int oncpu; -#endif + int on_cpu; #endif int prio, static_prio, normal_prio; diff --git a/kernel/sched.c b/kernel/sched.c index a187c3fe027b..cd2593e1a3ec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -838,18 +838,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->on_cpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -865,15 +886,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -882,7 +894,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) * SMP rebalancing from interrupt is the only thing that cares * here. 
*/ - next->oncpu = 1; + next->on_cpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW raw_spin_unlock_irq(&rq->lock); @@ -895,12 +907,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->oncpu is cleared, the task can be moved to a different CPU. + * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. */ smp_wmb(); - prev->oncpu = 0; + prev->on_cpu = 0; #endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); @@ -2686,8 +2698,8 @@ void sched_fork(struct task_struct *p, int clone_flags) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; +#if defined(CONFIG_SMP) + p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -5776,8 +5788,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.3-71-gd317 From c6eb3dda25892f1f974f5420f63e6721aab02f6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:41 +0200 Subject: mutex: Use p->on_cpu for the adaptive spin Since we now have p->on_cpu unconditionally available, use it to re-implement mutex_spin_on_owner. Requested-by: Thomas Gleixner Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.826338173@chello.nl --- include/linux/mutex.h | 2 +- include/linux/sched.h | 2 +- kernel/mutex-debug.c | 2 +- kernel/mutex-debug.h | 2 +- kernel/mutex.c | 2 +- kernel/mutex.h | 2 +- kernel/sched.c | 83 ++++++++++++++++++++------------------------------- 7 files changed, 39 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 94b48bd40dd7..c75471db576e 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -51,7 +51,7 @@ struct mutex { spinlock_t wait_lock; struct list_head wait_list; #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) - struct thread_info *owner; + struct task_struct *owner; #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3435837e89ff..173850479e2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(lock->owner != current); 
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa98900..fe4706cb0c5b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ for (;;) { - struct thread_info *owner; + struct task_struct *owner; /* * If we own the BKL, then don't spin. The owner of diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -19,7 +19,7 @@ #ifdef CONFIG_SMP static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/sched.c b/kernel/sched.c index cd2593e1a3ec..55cc50323ce1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4173,70 +4173,53 @@ need_resched: EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) -{ - unsigned int cpu; - struct rq *rq; - if (!sched_feat(OWNER_SPIN)) - return 0; +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * Need to access the cpu field knowing that - * DEBUG_PAGEALLOC could have unmapped it if - * the mutex owner just released it and exited. - */ - if (probe_kernel_address(&owner->cpu, cpu)) - return 0; -#else - cpu = owner->cpu; -#endif + rcu_read_lock(); + if (lock->owner != owner) + goto fail; /* - * Even if the access succeeded (likely case), - * the cpu field may no longer be valid. + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. */ - if (cpu >= nr_cpumask_bits) - return 0; + barrier(); - /* - * We need to validate that we can do a - * get_cpu() and that we have the percpu area. - */ - if (!cpu_online(cpu)) - return 0; + ret = owner->on_cpu; +fail: + rcu_read_unlock(); - rq = cpu_rq(cpu); + return ret; +} - for (;;) { - /* - * Owner changed, break to re-assess state. - */ - if (lock->owner != owner) { - /* - * If the lock has switched to a different owner, - * we likely have heavy contention. Return 0 to quit - * optimistic spinning and not contend further: - */ - if (lock->owner) - return 0; - break; - } +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; - /* - * Is that owner really running on that cpu? 
- */ - if (task_thread_info(rq->curr) != owner || need_resched()) + while (owner_running(lock, owner)) { + if (need_resched()) return 0; arch_mutex_cpu_relax(); } + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + return 1; } #endif -- cgit v1.2.3-71-gd317 From fd2f4419b4cbe8fe90796df9617c355762afd6a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:44 +0200 Subject: sched: Provide p->on_rq Provide a generic p->on_rq because the p->se.on_rq semantics are unfavourable for lockless wakeups but needed for sched_fair. In particular, p->on_rq is only cleared when we actually dequeue the task in schedule() and not on any random dequeue as done by things like __migrate_task() and __sched_setscheduler(). This also allows us to remove p->se usage from !sched_fair code. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.949545047@chello.nl --- include/linux/sched.h | 1 + kernel/sched.c | 38 ++++++++++++++++++++------------------ kernel/sched_debug.c | 2 +- kernel/sched_rt.c | 16 ++++++++-------- kernel/sched_stoptask.c | 2 +- 5 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 173850479e2c..b33a700652dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,6 +1202,7 @@ struct task_struct { #ifdef CONFIG_SMP int on_cpu; #endif + int on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; diff --git a/kernel/sched.c b/kernel/sched.c index 4481638f9178..dece28e505c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1785,7 +1785,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); - p->se.on_rq = 1; } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1793,7 +1792,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); - p->se.on_rq = 0; } /* @@ -2128,7 +2126,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -2203,7 +2201,7 @@ static bool migrate_task(struct task_struct *p, struct rq *rq) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. 
*/ - return p->se.on_rq || task_running(rq, p); + return p->on_rq || task_running(rq, p); } /* @@ -2263,7 +2261,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->se.on_rq; + on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -2444,6 +2442,7 @@ ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); + p->on_rq = 1; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -2506,7 +2505,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, cpu = task_cpu(p); - if (p->se.on_rq) + if (p->on_rq) goto out_running; orig_cpu = cpu; @@ -2583,7 +2582,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) return; - if (!p->se.on_rq) + if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); @@ -2620,19 +2619,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { + p->on_rq = 0; + + p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2750,6 +2751,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); + p->on_rq = 1; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -4051,7 +4053,7 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->se.on_rq) + if (prev->on_rq) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } @@ -4126,7 +4128,9 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; /* * If we are going to sleep and we have plugged IO queued, make @@ -4695,7 +4699,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) dequeue_task(rq, p, 0); @@ -4743,7 +4747,7 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); @@ -4877,8 +4881,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); - p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -5044,7 +5046,7 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) deactivate_task(rq, p, 0); @@ -5965,7 +5967,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * 
placed properly. */ - if (p->se.on_rq) { + if (p->on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); @@ -8339,7 +8341,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) int old_prio = p->prio; int on_rq; - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); @@ -8682,7 +8684,7 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; + on_rq = tsk->on_rq; if (on_rq) dequeue_task(rq, tsk, 0); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a4158..3669bec6e130 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) + if (!p->on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc65f82..9ca4f5f879c4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1136,7 +1136,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1287,7 +1287,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || - !task->se.on_rq)) { + !task->on_rq)) { raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; @@ -1321,7 +1321,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->rt.nr_cpus_allowed <= 1); - BUG_ON(!p->se.on_rq); + BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); return p; @@ -1467,7 +1467,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); + WARN_ON(!p->on_rq); /* * There's a chance that p is higher in priority @@ -1538,7 +1538,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Update the migration status of the RQ if we have an RT task * which is running AND changing its weight value. */ - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); if (!task_current(rq, p)) { @@ -1608,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->se.on_rq && !rq->rt.rt_nr_running) + if (p->on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1638,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. 
*/ - if (p->se.on_rq && rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1657,7 +1657,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!p->on_rq) return; if (rq->curr == p) { diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fdac..f607de42e6fc 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -26,7 +26,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->se.on_rq) + if (stop && stop->on_rq) return stop; return NULL; -- cgit v1.2.3-71-gd317 From 7608dec2ce2004c234339bef8c8074e5e601d0e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:46 +0200 Subject: sched: Drop the rq argument to sched_class::select_task_rq() In preparation of calling select_task_rq() without rq->lock held, drop the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.031077745@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- kernel/sched.c | 20 +++++++++++--------- kernel/sched_fair.c | 2 +- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 38 ++++++++++++++++++++++++++------------ kernel/sched_stoptask.c | 3 +-- 6 files changed, 41 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b33a700652dc..ff4e2f9c24a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,8 +1067,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct rq *rq, struct task_struct *p, - int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); diff --git a/kernel/sched.c b/kernel/sched.c index d398f2f0a3c9..d4b815d345b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2195,13 +2195,15 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, struct rq *rq) +static bool need_migrate_task(struct task_struct *p) { /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - return p->on_rq || task_running(rq, p); + bool running = p->on_rq || p->on_cpu; + smp_rmb(); /* finish_lock_switch() */ + return running; } /* @@ -2376,9 +2378,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 
*/ static inline -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2533,7 +2535,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, en_flags |= ENQUEUE_WAKING; } - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); __task_rq_unlock(rq); @@ -2744,7 +2746,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * We set TASK_WAKING so that select_task_rq() can drop rq->lock * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); set_task_cpu(p, cpu); p->state = TASK_RUNNING; @@ -3474,7 +3476,7 @@ void sched_exec(void) int dest_cpu; rq = task_rq_lock(p, &flags); - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3482,7 +3484,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { + likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -5911,7 +5913,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, rq)) { + if (need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ __task_rq_unlock(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4ee50f0af8d1..96b2c95ac356 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1657,7 +1657,7 @@ static int select_idle_sibling(struct task_struct *p, int target) * preempt must be disabled. 
*/ static int -select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a6396427..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -7,7 +7,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ca4f5f879c4..19ecb3127379 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -977,13 +977,23 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { + struct task_struct *curr; + struct rq *rq; + int cpu; + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); + cpu = task_cpu(p); + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + /* - * If the current task is an RT task, then + * If the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -997,21 +1007,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * lock? * * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. */ - if (unlikely(rt_task(rq->curr)) && - (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio) && + if (curr && unlikely(rt_task(curr)) && + (curr->rt.nr_cpus_allowed < 2 || + curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { - int cpu = find_lowest_rq(p); + int target = find_lowest_rq(p); - return (cpu == -1) ? task_cpu(p) : cpu; + if (target != -1) + cpu = target; } + rcu_read_unlock(); - /* - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - */ - return task_cpu(p); + return cpu; } static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index f607de42e6fc..6f437632afab 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -9,8 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct rq *rq, struct task_struct *p, - int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } -- cgit v1.2.3-71-gd317 From 74f8e4b2335de45485b8d5b31a504747f13c8070 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:47 +0200 Subject: sched: Remove rq argument to sched_class::task_waking() In preparation of calling this without rq->lock held, remove the dependency on the rq argument. 
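With the rq argument gone, the locking requirement on ->task_waking() is no longer visible in the function signature, which is why the sched_fair.c hunk below adds a lockdep_assert_held() on the task's runqueue lock. The idiom — drop the lock-carrying parameter, assert the contract inside the callee — can be sketched in plain C with pthreads; every name here (owned_lock, assert_held, rq_lock) is invented for illustration and none of this is kernel code:

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/* A lock that records its owner so callees can assert the locking
 * contract instead of receiving the lock as an argument (a userspace
 * stand-in for lockdep_assert_held()). */
struct owned_lock {
	pthread_mutex_t mutex;
	pthread_t owner;
	int held;
};

static void lock(struct owned_lock *l)
{
	pthread_mutex_lock(&l->mutex);
	l->owner = pthread_self();
	l->held = 1;
}

static void unlock(struct owned_lock *l)
{
	l->held = 0;
	pthread_mutex_unlock(&l->mutex);
}

static void assert_held(struct owned_lock *l)
{
	assert(l->held && pthread_equal(l->owner, pthread_self()));
}

static struct owned_lock rq_lock = { .mutex = PTHREAD_MUTEX_INITIALIZER };

/* Like task_waking(): no lock argument, but a hard locking requirement
 * that now trips an assertion instead of failing silently. */
static void task_waking(int pid)
{
	assert_held(&rq_lock);
	printf("waking task %d\n", pid);
}

int main(void)
{
	lock(&rq_lock);
	task_waking(42);
	unlock(&rq_lock);
	return 0;
}

(Compile with -pthread; calling task_waking() without the lock aborts, which is the point of the assertion.)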
Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.071474242@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- kernel/sched.c | 2 +- kernel/sched_fair.c | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff4e2f9c24a7..7f5732f8c618 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1048,8 +1048,12 @@ struct sched_domain; #define WF_FORK 0x02 /* child wakeup after fork */ #define ENQUEUE_WAKEUP 1 -#define ENQUEUE_WAKING 2 -#define ENQUEUE_HEAD 4 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif #define DEQUEUE_SLEEP 1 @@ -1071,7 +1075,7 @@ struct sched_class { void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, diff --git a/kernel/sched.c b/kernel/sched.c index d4b815d345b3..46f42cac4eb1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2531,7 +2531,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, p->state = TASK_WAKING; if (p->sched_class->task_waking) { - p->sched_class->task_waking(rq, p); + p->sched_class->task_waking(p); en_flags |= ENQUEUE_WAKING; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 96b2c95ac356..ad4c414f456d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1372,11 +1372,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static void task_waking_fair(struct rq *rq, struct task_struct *p) +static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + lockdep_assert_held(&task_rq(p)->lock); + se->vruntime -= cfs_rq->min_vruntime; } -- cgit v1.2.3-71-gd317 From a8e4f2eaecc9bfa4954adf79a04f4f22fddd829c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:49 +0200 Subject: sched: Delay task_contributes_to_load() In preparation of having to call task_contributes_to_load() without holding rq->lock, we need to store the result until we do hold it and can update the rq accounting accordingly.
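The deferral is a small two-phase pattern: sample the predicate while the task's state is still in hand, park the answer in a one-bit field, and fold it into the runqueue counters once the lock is actually held. A self-contained sketch of that shape — wakeup_prepare()/wakeup_commit() and the struct layouts are made-up illustrations, not the kernel's functions:

#include <stdio.h>

enum { TASK_RUNNING, TASK_UNINTERRUPTIBLE };

struct task {
	int state;
	unsigned contributes_to_load:1;	/* parked result, cf. sched_contributes_to_load */
};

struct runqueue {
	long nr_uninterruptible;
};

/* Phase 1: may run without the runqueue lock; touches only the task. */
static void wakeup_prepare(struct task *p)
{
	p->contributes_to_load = (p->state == TASK_UNINTERRUPTIBLE);
	p->state = TASK_RUNNING;	/* stands in for TASK_WAKING */
}

/* Phase 2: runs with the runqueue locked; consumes the parked bit. */
static void wakeup_commit(struct runqueue *rq, struct task *p)
{
	if (p->contributes_to_load)
		rq->nr_uninterruptible--;
}

int main(void)
{
	struct runqueue rq = { .nr_uninterruptible = 1 };
	struct task p = { .state = TASK_UNINTERRUPTIBLE };

	wakeup_prepare(&p);	/* lockless half */
	wakeup_commit(&rq, &p);	/* locked half */
	printf("nr_uninterruptible = %ld\n", rq.nr_uninterruptible);	/* prints 0 */
	return 0;
}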
Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.151523907@chello.nl --- include/linux/sched.h | 1 + kernel/sched.c | 16 ++++------------ 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7f5732f8c618..25c50317ddc1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1273,6 +1273,7 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; pid_t pid; pid_t tgid; diff --git a/kernel/sched.c b/kernel/sched.c index 7a5eb2620785..fd32b78c123c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2519,18 +2519,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (unlikely(task_running(rq, p))) goto out_activate; - /* - * In order to handle concurrent wakeups and release the rq->lock - * we put the task in TASK_WAKING state. - * - * First fix up the nr_uninterruptible count: - */ - if (task_contributes_to_load(p)) { - if (likely(cpu_online(orig_cpu))) - rq->nr_uninterruptible--; - else - this_rq()->nr_uninterruptible--; - } + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; if (p->sched_class->task_waking) { @@ -2555,6 +2544,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + out_activate: #endif /* CONFIG_SMP */ ttwu_activate(rq, p, en_flags); -- cgit v1.2.3-71-gd317 From 317f394160e9beb97d19a84c39b7e5eb3d7815a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:58 +0200 Subject: sched: Move the second half of ttwu() to the remote cpu Now that we've removed the rq->lock requirement from the first part of ttwu() and can compute placement without holding any rq->lock, ensure we execute the second half of ttwu() on the actual cpu we want the task to run on. This avoids having to take the remote rq->lock and do the task enqueue remotely, saving a lot on cacheline transfers.
As measured using: http://oss.oracle.com/~mason/sembench.c $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ; do echo performance > $i; done $ echo 4096 32000 64 128 > /proc/sys/kernel/sem $ ./sembench -t 2048 -w 1900 -o 0 unpatched: run time 30 seconds 647278 worker burns per second patched: run time 30 seconds 816715 worker burns per second Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.515897185@chello.nl --- include/linux/sched.h | 3 ++- init/Kconfig | 5 +++++ kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_features.h | 6 ++++++ 4 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 25c50317ddc1..e09dafa6e149 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1203,6 +1203,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP + struct task_struct *wake_entry; int on_cpu; #endif int on_rq; @@ -2192,7 +2193,7 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -static inline void scheduler_ipi(void) { } +void scheduler_ipi(void); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } diff --git a/init/Kconfig b/init/Kconfig index 56240e724d9a..32745bfe059e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -827,6 +827,11 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TTWU_QUEUE + bool + depends on !SPARC32 + default y + config MM_OWNER bool diff --git a/kernel/sched.c b/kernel/sched.c index 7d8b85fcdf06..9e3ede120e81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -556,6 +556,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif + +#ifdef CONFIG_SMP + struct task_struct *wake_list; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -2516,10 +2520,61 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) return ret; } +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + raw_spin_lock(&rq->lock); + + while (list) { + struct task_struct *p = list; + list = list->wake_entry; + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ + sched_ttwu_pending(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *next = rq->wake_list; + + for (;;) { + struct task_struct *old = next; + + p->wake_entry = next; + next = cmpxchg(&rq->wake_list, old, p); + if (next == old) + break; + } + + if (!next) + smp_send_reschedule(cpu); +} +#endif + static void ttwu_queue(struct task_struct *p, int cpu) { struct rq *rq = cpu_rq(cpu); +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + ttwu_queue_remote(p, cpu); + return; + } +#endif + raw_spin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); raw_spin_unlock(&rq->lock); @@ -6331,6 +6386,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: + sched_ttwu_pending(); /* 
Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b9..be40f7371ee1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) * Decrement CPU power based on irq activity */ SCHED_FEAT(NONIRQ_POWER, 1) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, 1) -- cgit v1.2.3-71-gd317
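The ttwu_queue_remote() path above is a classic lock-free list push: publish the task with cmpxchg() and send the IPI only when the list goes from empty to non-empty, so back-to-back wakeups batch into a single interrupt. The same scheme can be sketched with C11 atomics in userspace — wake_list_push()/wake_list_drain() and this struct task are hypothetical names for illustration, not the kernel implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task {
	int pid;
	struct task *wake_entry;	/* next pointer, owned by the queue */
};

/* One head per consumer; producers push, the consumer takes the lot. */
static _Atomic(struct task *) wake_list;

/* Push p onto the head. Returns true if the list was empty, i.e. the
 * caller must kick the consumer (the kernel sends a resched IPI here). */
static bool wake_list_push(struct task *p)
{
	struct task *next = atomic_load(&wake_list);

	for (;;) {
		p->wake_entry = next;
		/* On failure the CAS reloads next with the current head. */
		if (atomic_compare_exchange_weak(&wake_list, &next, p))
			return next == NULL;
	}
}

/* Consumer: steal the whole list with one exchange, walk it privately. */
static void wake_list_drain(void)
{
	struct task *list = atomic_exchange(&wake_list, NULL);

	while (list) {
		struct task *p = list;

		list = list->wake_entry;
		printf("activating pid %d\n", p->pid);	/* cf. ttwu_do_activate() */
	}
}

int main(void)
{
	struct task a = { .pid = 1 }, b = { .pid = 2 };

	if (wake_list_push(&a))
		puts("list was empty: would send IPI");
	wake_list_push(&b);	/* list non-empty: no second IPI needed */
	wake_list_drain();	/* LIFO within a batch: pid 2, then pid 1 */
	return 0;
}

Draining with a single xchg() keeps the consumer wait-free and makes the empty-to-non-empty transition the only moment an IPI is required, which is how the patch keeps remote rq->lock traffic off the waking CPU.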