From dd935f44a40f8fb02aff2cc0df2269c92422df1c Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Wed, 28 Aug 2013 21:43:09 -0700 Subject: libceph: add function to ensure notifies are complete Without a way to flush the osd client's notify workqueue, a watch event that is unregistered could continue receiving callbacks indefinitely. Unregistering the event simply means no new notifies are added to the queue, but there may still be events in the queue that will call the watch callback for the event. If the queue is flushed after the event is unregistered, the caller can be sure no more watch callbacks will occur for the canceled watch. Signed-off-by: Josh Durgin Reviewed-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/osd_client.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ce6df39f60ff..8f47625a0661 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -335,6 +335,8 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); extern void ceph_osdc_sync(struct ceph_osd_client *osdc); +extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); + extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct ceph_file_layout *layout, -- cgit v1.2.3-71-gd317 From 7bd36014460f793c19e7d6c94dab67b0afcfcb7f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Sep 2013 16:50:56 -0700 Subject: timekeeping: Fix HRTICK related deadlock from ntp lock changes Gerlando Falauto reported that when HRTICK is enabled, it is possible to trigger system deadlocks. These were hard to reproduce, as HRTICK has been broken in the past, but seemed to be connected to the timekeeping_seq lock. Since seqlock/seqcount's aren't supported w/ lockdep, I added some extra spinlock based locking and triggered the following lockdep output: [ 15.849182] ntpd/4062 is trying to acquire lock: [ 15.849765] (&(&pool->lock)->rlock){..-...}, at: [] __queue_work+0x145/0x480 [ 15.850051] [ 15.850051] but task is already holding lock: [ 15.850051] (timekeeper_lock){-.-.-.}, at: [] do_adjtimex+0x7f/0x100 [ 15.850051] Chain exists of: &(&pool->lock)->rlock --> &p->pi_lock --> timekeeper_lock [ 15.850051] Possible unsafe locking scenario: [ 15.850051] [ 15.850051] CPU0 CPU1 [ 15.850051] ---- ---- [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&p->pi_lock); [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&(&pool->lock)->rlock); [ 15.850051] [ 15.850051] *** DEADLOCK *** The deadlock was introduced by 06c017fdd4dc48451a ("timekeeping: Hold timekeepering locks in do_adjtimex and hardpps") in 3.10 This patch avoids this deadlock, by moving the call to schedule_delayed_work() outside of the timekeeper lock critical section. Reported-by: Gerlando Falauto Tested-by: Lin Ming Signed-off-by: John Stultz Cc: Mathieu Desnoyers Cc: stable #3.11, 3.10 Link: http://lkml.kernel.org/r/1378943457-27314-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/timex.h | 1 + kernel/time/ntp.c | 6 ++---- kernel/time/timekeeping.c | 2 ++ 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index b3726e61368e..dd3edd7dfc94 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -141,6 +141,7 @@ extern int do_adjtimex(struct timex *); extern void hardpps(const struct timespec *, const struct timespec *); int read_current_timer(unsigned long *timer_val); +void ntp_notify_cmos_timer(void); /* The clock frequency of the i8253/i8254 PIT */ #define PIT_TICK_RATE 1193182ul diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8f5b3b98577b..bb2215174f05 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work) schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } -static void notify_cmos_timer(void) +void ntp_notify_cmos_timer(void) { schedule_delayed_work(&sync_cmos_work, 0); } #else -static inline void notify_cmos_timer(void) { } +void ntp_notify_cmos_timer(void) { } #endif @@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; - notify_cmos_timer(); - return result; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 48b9fffabdc2..947ba25a95a0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1703,6 +1703,8 @@ int do_adjtimex(struct timex *txc) write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + ntp_notify_cmos_timer(); + return ret; } -- cgit v1.2.3-71-gd317 From 35e4237973665c8a1ad4e3f7a7cb87573deaa24a Mon Sep 17 00:00:00 2001 From: Joseph Gasparakis Date: Fri, 13 Sep 2013 07:34:13 -0700 Subject: vxlan: Fix sparse warnings This patch fixes sparse warnings when incorrectly handling the port number and using int instead of unsigned int iterating through &vn->sock_list[]. Keeping the port as __be16 also makes things clearer wrt endianess. Also, it was pointed out that vxlan_get_rx_port() had unnecessary checks which got removed. Signed-off-by: Joseph Gasparakis Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 18 ++++++++---------- include/linux/netdevice.h | 8 ++++---- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bf64b4191dcc..2400b1beddd5 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -564,7 +564,7 @@ static void vxlan_notify_add_rx_port(struct sock *sk) struct net_device *dev; struct net *net = sock_net(sk); sa_family_t sa_family = sk->sk_family; - u16 port = htons(inet_sk(sk)->inet_sport); + __be16 port = inet_sk(sk)->inet_sport; rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -581,7 +581,7 @@ static void vxlan_notify_del_rx_port(struct sock *sk) struct net_device *dev; struct net *net = sock_net(sk); sa_family_t sa_family = sk->sk_family; - u16 port = htons(inet_sk(sk)->inet_sport); + __be16 port = inet_sk(sk)->inet_sport; rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -2021,7 +2021,8 @@ static struct device_type vxlan_type = { }; /* Calls the ndo_add_vxlan_port of the caller in order to - * supply the listening VXLAN udp ports. + * supply the listening VXLAN udp ports. Callers are expected + * to implement the ndo_add_vxlan_port. */ void vxlan_get_rx_port(struct net_device *dev) { @@ -2029,16 +2030,13 @@ void vxlan_get_rx_port(struct net_device *dev) struct net *net = dev_net(dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); sa_family_t sa_family; - u16 port; - int i; - - if (!dev || !dev->netdev_ops || !dev->netdev_ops->ndo_add_vxlan_port) - return; + __be16 port; + unsigned int i; spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { - hlist_for_each_entry_rcu(vs, vs_head(net, i), hlist) { - port = htons(inet_sk(vs->sock->sk)->inet_sport); + hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { + port = inet_sk(vs->sock->sk)->inet_sport; sa_family = vs->sock->sk->sk_family; dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, port); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 041b42a305f6..3de49aca4519 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -950,14 +950,14 @@ struct netdev_phys_port_id { * multiple net devices on single physical port. * * void (*ndo_add_vxlan_port)(struct net_device *dev, - * sa_family_t sa_family, __u16 port); + * sa_family_t sa_family, __be16 port); * Called by vxlan to notiy a driver about the UDP port and socket * address family that vxlan is listnening to. It is called only when * a new port starts listening. The operation is protected by the * vxlan_net->sock_lock. * * void (*ndo_del_vxlan_port)(struct net_device *dev, - * sa_family_t sa_family, __u16 port); + * sa_family_t sa_family, __be16 port); * Called by vxlan to notify the driver about a UDP port and socket * address family that vxlan is not listening to anymore. The operation * is protected by the vxlan_net->sock_lock. @@ -1093,10 +1093,10 @@ struct net_device_ops { struct netdev_phys_port_id *ppid); void (*ndo_add_vxlan_port)(struct net_device *dev, sa_family_t sa_family, - __u16 port); + __be16 port); void (*ndo_del_vxlan_port)(struct net_device *dev, sa_family_t sa_family, - __u16 port); + __be16 port); }; /* -- cgit v1.2.3-71-gd317 From 0f1799ba1a5db4c48b72ac2da2dc70d8c190a73d Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 16 Sep 2013 20:04:53 +0200 Subject: netfilter: ipset: Consistent userspace testing with nomatch flag The "nomatch" commandline flag should invert the matching at testing, similarly to the --return-nomatch flag of the "set" match of iptables. Until now it worked with the elements with "nomatch" flag only. From now on it works with elements without the flag too, i.e: # ipset n test hash:net # ipset a test 10.0.0.0/24 nomatch # ipset t test 10.0.0.1 10.0.0.1 is NOT in set test. # ipset t test 10.0.0.1 nomatch 10.0.0.1 is in set test. # ipset a test 192.168.0.0/24 # ipset t test 192.168.0.1 192.168.0.1 is in set test. # ipset t test 192.168.0.1 nomatch 192.168.0.1 is NOT in set test. Before the patch the results were ... # ipset t test 192.168.0.1 192.168.0.1 is in set test. # ipset t test 192.168.0.1 nomatch 192.168.0.1 is in set test. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 6 ++++-- net/netfilter/ipset/ip_set_core.c | 3 +-- net/netfilter/ipset/ip_set_hash_ipportnet.c | 4 ++-- net/netfilter/ipset/ip_set_hash_net.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netiface.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netport.c | 4 ++-- 6 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index d80e2753847c..9ac9fbde7b61 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -296,10 +296,12 @@ ip_set_eexist(int ret, u32 flags) /* Match elements marked with nomatch */ static inline bool -ip_set_enomatch(int ret, u32 flags, enum ipset_adt adt) +ip_set_enomatch(int ret, u32 flags, enum ipset_adt adt, struct ip_set *set) { return adt == IPSET_TEST && - ret == -ENOTEMPTY && ((flags >> 16) & IPSET_FLAG_NOMATCH); + (set->type->features & IPSET_TYPE_NOMATCH) && + ((flags >> 16) & IPSET_FLAG_NOMATCH) && + (ret > 0 || ret == -ENOTEMPTY); } /* Check the NLA_F_NET_BYTEORDER flag */ diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index f77139007983..c8c303c3386f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1489,8 +1489,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, if (ret == -EAGAIN) ret = 1; - return (ret < 0 && ret != -ENOTEMPTY) ? ret : - ret > 0 ? 0 : -IPSET_ERR_EXIST; + return ret > 0 ? 0 : -IPSET_ERR_EXIST; } /* Get headed data of a set */ diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index c6a525373be4..f15f3e28b9c3 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -260,7 +260,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], e.ip = htonl(ip); e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } @@ -544,7 +544,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index da740ceb56ae..223e9f546d0f 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -199,7 +199,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret: ip_set_eexist(ret, flags) ? 0 : ret; } @@ -396,7 +396,7 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 84ae6f6ce624..7d798d5d5cd3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -368,7 +368,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } @@ -634,7 +634,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 9a0869853be5..09d6690bee6f 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -244,7 +244,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } @@ -489,7 +489,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); - return ip_set_enomatch(ret, flags, adt) ? 1 : + return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } -- cgit v1.2.3-71-gd317 From 7c2330f1afe13b3a934140857bc8060d00103a89 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Sep 2013 18:08:02 -0700 Subject: regulator: fix fatal kernel-doc error Fix fatal kernel-doc error in : Error(include/linux/regulator/driver.h:52): cannot understand prototype: 'struct regulator_linear_range ' Signed-off-by: Randy Dunlap [Rewrote first line -- broonie] Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 67e13aa5a478..9bdad43ad228 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -40,6 +40,8 @@ enum regulator_status { }; /** + * struct regulator_linear_range - specify linear voltage ranges + * * Specify a range of voltages for regulator_map_linar_range() and * regulator_list_linear_range(). * -- cgit v1.2.3-71-gd317 From f84cb8a46a771f36a04a02c61ea635c968ed5f6a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 19 Sep 2013 12:13:58 -0400 Subject: dm mpath: disable WRITE SAME if it fails Workaround the SCSI layer's problematic WRITE SAME heuristics by disabling WRITE SAME in the DM multipath device's queue_limits if an underlying device disabled it. The WRITE SAME heuristics, with both the original commit 5db44863b6eb ("[SCSI] sd: Implement support for WRITE SAME") and the updated commit 66c28f971 ("[SCSI] sd: Update WRITE SAME heuristics"), default to enabling WRITE SAME(10) even without successfully determining it is supported. After the first failed WRITE SAME the SCSI layer will disable WRITE SAME for the device (by setting sdkp->device->no_write_same which results in 'max_write_same_sectors' in device's queue_limits to be set to 0). When a device is stacked ontop of such a SCSI device any changes to that SCSI device's queue_limits do not automatically propagate up the stack. As such, a DM multipath device will not have its WRITE SAME support disabled. This causes the block layer to continue to issue WRITE SAME requests to the mpath device which causes paths to fail and (if mpath IO isn't configured to queue when no paths are available) it will result in actual IO errors to the upper layers. This fix doesn't help configurations that have additional devices stacked ontop of the mpath device (e.g. LVM created linear DM devices ontop). A proper fix that restacks all the queue_limits from the bottom of the device stack up will need to be explored if SCSI will continue to use this model of optimistically allowing op codes and then disabling them after they fail for the first time. Before this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: failing WRITE SAME IO with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. device-mapper: multipath: Failing path 8:112. end_request: I/O error, dev dm-6, sector 4616 dm-6: WRITE SAME failed. Manually zeroing. end_request: I/O error, dev dm-6, sector 4616 end_request: I/O error, dev dm-6, sector 5640 end_request: I/O error, dev dm-6, sector 6664 end_request: I/O error, dev dm-6, sector 7688 end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. end_request: I/O error, dev dm-6, sector 524296 Aborting journal on device dm-6-8. end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 33553920 After this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: WRITE SAME I/O failed with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 0 It should be noted that WRITE SAME support wasn't enabled in DM multipath until v3.10. Signed-off-by: Mike Snitzer Cc: Martin K. Petersen Cc: Hannes Reinecke Cc: stable@vger.kernel.org # 3.10+ --- drivers/md/dm-mpath.c | 11 ++++++++++- drivers/md/dm.c | 11 +++++++++++ include/linux/device-mapper.h | 3 ++- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 075747e25f77..e08be94959fd 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1299,8 +1299,17 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (noretry_error(error)) + if (noretry_error(error)) { + if ((clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors) { + struct queue_limits *limits; + + /* device doesn't really support WRITE SAME, disable it */ + limits = dm_get_queue_limits(dm_table_get_md(m->ti->table)); + limits->max_write_same_sectors = 0; + } return error; + } if (mpio->pgpath) fail_path(mpio->pgpath); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6a5e9ed2fcc3..7b0ccdc5d65c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2277,6 +2277,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md) return md->immutable_target_type; } +/* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. + */ +struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ + BUG_ON(!atomic_read(&md->holders)); + return &md->queue->limits; +} +EXPORT_SYMBOL_GPL(dm_get_queue_limits); + /* * Fully initialize a request-based queue (->elevator, ->request_fn, etc). */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 653073de09e3..ed419c62dde1 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -406,13 +406,14 @@ int dm_noflush_suspending(struct dm_target *ti); union map_info *dm_get_mapinfo(struct bio *bio); union map_info *dm_get_rq_mapinfo(struct request *rq); +struct queue_limits *dm_get_queue_limits(struct mapped_device *md); + /* * Geometry functions. */ int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo); int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo); - /*----------------------------------------------------------------- * Functions for manipulating device-mapper tables. *---------------------------------------------------------------*/ -- cgit v1.2.3-71-gd317 From c26d436cbf7a9549ec1073480a2e3f0d3f64e02d Mon Sep 17 00:00:00 2001 From: Andre Naujoks Date: Fri, 13 Sep 2013 19:37:12 +0200 Subject: lib: introduce upper case hex ascii helpers To be able to use the hex ascii functions in case sensitive environments the array hex_asc_upper[] and the needed functions for hex_byte_pack_upper() are introduced. Signed-off-by: Andre Naujoks Signed-off-by: David S. Miller --- include/linux/kernel.h | 11 +++++++++++ lib/hexdump.c | 2 ++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 482ad2d84a32..672ddc4de4af 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,6 +439,17 @@ static inline char *hex_byte_pack(char *buf, u8 byte) return buf; } +extern const char hex_asc_upper[]; +#define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)] +#define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4] + +static inline char *hex_byte_pack_upper(char *buf, u8 byte) +{ + *buf++ = hex_asc_upper_hi(byte); + *buf++ = hex_asc_upper_lo(byte); + return buf; +} + static inline char * __deprecated pack_hex_byte(char *buf, u8 byte) { return hex_byte_pack(buf, byte); diff --git a/lib/hexdump.c b/lib/hexdump.c index 3f0494c9d57a..8499c810909a 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -14,6 +14,8 @@ const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); +const char hex_asc_upper[] = "0123456789ABCDEF"; +EXPORT_SYMBOL(hex_asc_upper); /** * hex_to_bin - convert a hex digit to its real value -- cgit v1.2.3-71-gd317 From 75afb352991ff1cd3cf5955bfe611de6d83a0c87 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Sat, 21 Sep 2013 13:57:47 -0600 Subject: block: Add nr_bios to block_rq_remap tracepoint Adding the number of bios in a remapped request to 'block_rq_remap' tracepoint. Request remapper clones bios in a request to track the completion status of each bio. So the number of bios can be useful information for investigation. Related discussions: http://www.redhat.com/archives/dm-devel/2013-August/msg00084.html http://www.redhat.com/archives/dm-devel/2013-September/msg00024.html Signed-off-by: Jun'ichi Nomura Acked-by: Mike Snitzer Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 +++++++++++ include/trace/events/block.h | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2fdb4a451b49..0e6f765aa1f5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -862,6 +862,17 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) return blk_queue_get_max_sectors(q, rq->cmd_flags); } +static inline unsigned int blk_rq_count_bios(struct request *rq) +{ + unsigned int nr_bios = 0; + struct bio *bio; + + __rq_for_each_bio(bio, rq) + nr_bios++; + + return nr_bios; +} + /* * Request issue related functions. */ diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 60ae7c3db912..4c2301d2ef1a 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -618,6 +618,7 @@ TRACE_EVENT(block_rq_remap, __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) + __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), @@ -627,15 +628,16 @@ TRACE_EVENT(block_rq_remap, __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; + __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); ), - TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", + TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), - (unsigned long long)__entry->old_sector) + (unsigned long long)__entry->old_sector, __entry->nr_bios) ); #endif /* _TRACE_BLOCK_H */ -- cgit v1.2.3-71-gd317 From 82aeef0bf03684b377678c00c05e613f30dca39c Mon Sep 17 00:00:00 2001 From: "Li, Zhen-Hua" Date: Fri, 13 Sep 2013 14:27:32 +0800 Subject: x86/iommu: correct ICS register offset According to Intel Vt-D specs, the offset of Invalidation complete status register should be 0x9C, not 0x98. See Intel's VT-d spec, Revision 1.3, Chapter 10.4, Page 98; Signed-off-by: Li, Zhen-Hua Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 78e2ada50cd5..d380c5e68008 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -55,7 +55,7 @@ #define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */ #define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ -#define DMAR_ICS_REG 0x98 /* Invalidation complete status register */ +#define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ #define OFFSET_STRIDE (9) -- cgit v1.2.3-71-gd317 From 9809b18fcf6b8d8ec4d3643677345907e6b50eca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 Sep 2013 15:27:30 -0700 Subject: watchdog: update watchdog_thresh properly watchdog_tresh controls how often nmi perf event counter checks per-cpu hrtimer_interrupts counter and blows up if the counter hasn't changed since the last check. The counter is updated by per-cpu watchdog_hrtimer hrtimer which is scheduled with 2/5 watchdog_thresh period which guarantees that hrtimer is scheduled 2 times per the main period. Both hrtimer and perf event are started together when the watchdog is enabled. So far so good. But... But what happens when watchdog_thresh is updated from sysctl handler? proc_dowatchdog will set a new sampling period and hrtimer callback (watchdog_timer_fn) will use the new value in the next round. The problem, however, is that nobody tells the perf event that the sampling period has changed so it is ticking with the period configured when it has been set up. This might result in an ear ripping dissonance between perf and hrtimer parts if the watchdog_thresh is increased. And even worse it might lead to KABOOM if the watchdog is configured to panic on such a spurious lockup. This patch fixes the issue by updating both nmi perf even counter and hrtimers if the threshold value has changed. The nmi one is disabled and then reinitialized from scratch. This has an unpleasant side effect that the allocation of the new event might fail theoretically so the hard lockup detector would be disabled for such cpus. On the other hand such a memory allocation failure is very unlikely because the original event is deallocated right before. It would be much nicer if we just changed perf event period but there doesn't seem to be any API to do that right now. It is also unfortunate that perf_event_alloc uses GFP_KERNEL allocation unconditionally so we cannot use on_each_cpu() and do the same thing from the per-cpu context. The update from the current CPU should be safe because perf_event_disable removes the event atomically before it clears the per-cpu watchdog_ev so it cannot change anything under running handler feet. The hrtimer is simply restarted (thanks to Don Zickus who has pointed this out) if it is queued because we cannot rely it will fire&adopt to the new sampling period before a new nmi event triggers (when the treshold is decreased). [akpm@linux-foundation.org: the UP version of __smp_call_function_single ended up in the wrong place] Signed-off-by: Michal Hocko Acked-by: Don Zickus Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Fabio Estevam Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 6 ++++++ kernel/watchdog.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index cfb7ca094b38..731f5237d5f4 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -155,6 +155,12 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } +static inline void __smp_call_function_single(int cpuid, + struct call_single_data *data, int wait) +{ + on_each_cpu(data->func, data->info, wait); +} + #endif /* !SMP */ /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ced7d0609931..4431610f049a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static int watchdog_enable_all_cpus(void) +static void restart_watchdog_hrtimer(void *info) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + int ret; + + /* + * No need to cancel and restart hrtimer if it is currently executing + * because it will reprogram itself with the new period now. + * We should never see it unqueued here because we are running per-cpu + * with interrupts disabled. + */ + ret = hrtimer_try_to_cancel(hrtimer); + if (ret == 1) + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); +} + +static void update_timers(int cpu) +{ + struct call_single_data data = {.func = restart_watchdog_hrtimer}; + /* + * Make sure that perf event counter will adopt to a new + * sampling period. Updating the sampling period directly would + * be much nicer but we do not have an API for that now so + * let's use a big hammer. + * Hrtimer will adopt the new period on the next tick but this + * might be late already so we have to restart the timer as well. + */ + watchdog_nmi_disable(cpu); + __smp_call_function_single(cpu, &data, 1); + watchdog_nmi_enable(cpu); +} + +static void update_timers_all_cpus(void) +{ + int cpu; + + get_online_cpus(); + preempt_disable(); + for_each_online_cpu(cpu) + update_timers(cpu); + preempt_enable(); + put_online_cpus(); +} + +static int watchdog_enable_all_cpus(bool sample_period_changed) { int err = 0; @@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void) pr_err("Failed to create watchdog threads, disabled\n"); else watchdog_running = 1; + } else if (sample_period_changed) { + update_timers_all_cpus(); } return err; @@ -537,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, * watchdog_*_all_cpus() function takes care of this. */ if (watchdog_user_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); + err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); else watchdog_disable_all_cpus(); @@ -557,5 +604,5 @@ void __init lockup_detector_init(void) set_sample_period(); if (watchdog_user_enabled) - watchdog_enable_all_cpus(); + watchdog_enable_all_cpus(false); } -- cgit v1.2.3-71-gd317 From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:37 -0700 Subject: revert "memcg: enhance memcg iterator to support predicates" Revert commit de57780dc659 ("memcg: enhance memcg iterator to support predicates") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 49 ++++---------------------------- mm/memcontrol.c | 70 ++++++++++------------------------------------ mm/vmscan.c | 16 +++++++---- 3 files changed, 32 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60e95872da29..ef2b9bd7fafa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,23 +53,6 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; -enum mem_cgroup_filter_t { - VISIT, /* visit current node */ - SKIP, /* skip the current node and continue traversal */ - SKIP_TREE, /* skip the whole subtree and continue traversal */ -}; - -/* - * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to - * iterate through the hierarchy tree. Each tree element is checked by the - * predicate before it is returned by the iterator. If a filter returns - * SKIP or SKIP_TREE then the iterator code continues traversal (with the - * next node down the hierarchy or the next node that doesn't belong under the - * memcg's subtree). - */ -typedef enum mem_cgroup_filter_t -(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); - #ifdef CONFIG_MEMCG /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -137,18 +120,9 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); -struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, - struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond); - -static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, - struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) -{ - return mem_cgroup_iter_cond(root, prev, reclaim, NULL); -} - +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, + struct mem_cgroup *, + struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); /* @@ -260,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, mem_cgroup_update_page_stat(page, idx, -1); } -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root); void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); @@ -376,15 +349,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok) { } -static inline struct mem_cgroup * -mem_cgroup_iter_cond(struct mem_cgroup *root, - struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond) -{ - /* first call must return non-NULL, second return NULL */ - return (struct mem_cgroup *)(unsigned long)!prev; -} static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, @@ -471,11 +435,10 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, } static inline -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { - return VISIT; + return false; } static inline void mem_cgroup_split_huge_fixup(struct page *head) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 916892c2b8e0..65e7bec4b0f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -862,15 +862,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -static enum mem_cgroup_filter_t -mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, - mem_cgroup_iter_filter cond) -{ - if (!cond) - return VISIT; - return cond(memcg, root); -} - /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -878,7 +869,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) + struct mem_cgroup *last_visited) { struct cgroup_subsys_state *prev_css, *next_css; @@ -896,31 +887,11 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - switch (mem_cgroup_filter(mem, root, cond)) { - case SKIP: + if (css_tryget(&mem->css)) + return mem; + else { prev_css = next_css; goto skip_node; - case SKIP_TREE: - if (mem == root) - return NULL; - /* - * css_rightmost_descendant is not an optimal way to - * skip through a subtree (especially for imbalanced - * trees leaning to right) but that's what we have right - * now. More effective solution would be traversing - * right-up for first non-NULL without calling - * css_next_descendant_pre afterwards. - */ - prev_css = css_rightmost_descendant(next_css); - goto skip_node; - case VISIT: - if (css_tryget(&mem->css)) - return mem; - else { - prev_css = next_css; - goto skip_node; - } - break; } } @@ -984,7 +955,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks - * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -997,18 +967,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond) + struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) { - /* first call must return non-NULL, second return NULL */ - return (struct mem_cgroup *)(unsigned long)!prev; - } + if (mem_cgroup_disabled()) + return NULL; if (!root) root = root_mem_cgroup; @@ -1019,9 +986,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - if (mem_cgroup_filter(root, root, cond) == VISIT) - return root; - return NULL; + return root; } rcu_read_lock(); @@ -1044,7 +1009,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited, cond); + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1055,11 +1020,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, reclaim->generation = iter->generation; } - /* - * We have finished the whole tree walk or no group has been - * visited because filter told us to skip the root node. - */ - if (!memcg && (prev || (cond && !last_visited))) + if (prev && !memcg) goto out_unlock; } out_unlock: @@ -1804,14 +1765,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) - return VISIT; + return true; /* * If any parent up to the root in the hierarchy is over its soft limit @@ -1819,12 +1779,12 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) - return VISIT; + return true; if (parent == root) break; } - return SKIP; + return false; } static DEFINE_SPINLOCK(memcg_oom_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index cdd300b81485..17a7134de4d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2185,16 +2185,21 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) .zone = zone, .priority = sc->priority, }; - struct mem_cgroup *memcg = NULL; - mem_cgroup_iter_filter filter = (soft_reclaim) ? - mem_cgroup_soft_reclaim_eligible : NULL; + struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { struct lruvec *lruvec; + if (soft_reclaim && + !mem_cgroup_soft_reclaim_eligible(memcg, root)) { + memcg = mem_cgroup_iter(root, memcg, &reclaim); + continue; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2214,7 +2219,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) mem_cgroup_iter_break(root, memcg); break; } - } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, -- cgit v1.2.3-71-gd317 From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:38 -0700 Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim" Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also for targeted reclaim") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++---- mm/memcontrol.c | 14 +++++--------- mm/vmscan.c | 4 ++-- 3 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ef2b9bd7fafa..6054c9f3a5e8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, mem_cgroup_update_page_stat(page, idx, -1); } -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root); +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg); void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, @@ -435,8 +434,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, } static inline -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root) +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65e7bec4b0f0..47cdc7eb1a6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1760,13 +1760,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif /* - * A group is eligible for the soft limit reclaim under the given root - * hierarchy if - * a) it is over its soft limit + * A group is eligible for the soft limit reclaim if + * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root) +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) { struct mem_cgroup *parent = memcg; @@ -1774,14 +1772,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, return true; /* - * If any parent up to the root in the hierarchy is over its soft limit - * then we have to obey and reclaim from this group as well. + * If any parent up the hierarchy is over its soft limit then we + * have to obey and reclaim from this group as well. */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) return true; - if (parent == root) - break; } return false; diff --git a/mm/vmscan.c b/mm/vmscan.c index 17a7134de4d7..0e081cada4ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc) static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) { - return !mem_cgroup_disabled(); + return !mem_cgroup_disabled() && global_reclaim(sc); } #else static bool global_reclaim(struct scan_control *sc) @@ -2195,7 +2195,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) struct lruvec *lruvec; if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg, root)) { + !mem_cgroup_soft_reclaim_eligible(memcg)) { memcg = mem_cgroup_iter(root, memcg, &reclaim); continue; } -- cgit v1.2.3-71-gd317 From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:41 -0700 Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code" Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim tighter with zone shrinking code") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++- mm/memcontrol.c | 163 +++++++++++++++++++++++++++++++++++++++------ mm/vmscan.c | 62 ++++++++--------- 3 files changed, 175 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6054c9f3a5e8..ecc82b37c4cc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,7 +234,9 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, mem_cgroup_update_page_stat(page, idx, -1); } -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg); +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, @@ -434,9 +436,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, } static inline -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) { - return false; + return 0; } static inline void mem_cgroup_split_huge_fixup(struct page *head) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 852dbec07ce6..1c52ddbc839b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1991,28 +1991,57 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * A group is eligible for the soft limit reclaim if - * a) it is over its soft limit - * b) any parent up the hierarchy is over its soft limit - */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = memcg; - - if (res_counter_soft_limit_excess(&memcg->res)) - return true; - - /* - * If any parent up the hierarchy is over its soft limit then we - * have to obey and reclaim from this group as well. - */ - while ((parent = parent_mem_cgroup(parent))) { - if (res_counter_soft_limit_excess(&parent->res)) - return true; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; + int loop = 0; + unsigned long excess; + unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; + + excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + + while (1) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { + loop++; + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might because there are + * no reclaimable pages under this hierarchy + */ + if (!total) + break; + /* + * We want to do more targeted reclaim. + * excess >> 2 is not to excessive so as to + * reclaim too much, nor too less that we keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) + break; + } + continue; + } + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } - - return false; + mem_cgroup_iter_break(root_memcg, victim); + return total; } static DEFINE_SPINLOCK(memcg_oom_lock); @@ -4761,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + unsigned long nr_scanned; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + nr_scanned = 0; + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + gfp_mask, &nr_scanned); + nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have aded the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) + css_put(&next_mz->memcg->css); + else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->memcg->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->memcg->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->memcg->css); + return nr_reclaimed; +} + /** * mem_cgroup_force_empty_list - clears LRU of a group * @memcg: group to clear diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e081cada4ba..beb35778c69f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -139,21 +139,11 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - return !mem_cgroup_disabled() && global_reclaim(sc); -} #else static bool global_reclaim(struct scan_control *sc) { return true; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - return false; -} #endif unsigned long zone_reclaimable_pages(struct zone *zone) @@ -2174,8 +2164,7 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void -__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; @@ -2194,12 +2183,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) do { struct lruvec *lruvec; - if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg)) { - memcg = mem_cgroup_iter(root, memcg, &reclaim); - continue; - } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2230,24 +2213,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) sc->nr_scanned - nr_scanned, sc)); } - -static void shrink_zone(struct zone *zone, struct scan_control *sc) -{ - bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); - unsigned long nr_scanned = sc->nr_scanned; - - __shrink_zone(zone, sc, do_soft_reclaim); - - /* - * No group is over the soft limit or those that are do not have - * pages in the zone we are reclaiming so we have to reclaim everybody - */ - if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { - __shrink_zone(zone, sc, false); - return; - } -} - /* Returns true if compaction should go ahead for a high-order request */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { @@ -2309,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; bool aborted_reclaim = false; /* @@ -2348,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; } } + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. + */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } @@ -2941,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, @@ -3055,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_scanned = 0; + nr_soft_scanned = 0; + /* + * Call soft limit reclaim before calling shrink_zone. + */ + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* * There should be no need to raise the scanning * priority if enough pages are already being scanned -- cgit v1.2.3-71-gd317 From b0b8c960ffcc5bfc82d1a09ad931673e3a36c15a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 4 Sep 2013 10:52:57 -0500 Subject: of: clean-up ifdefs in of_irq.h Much of of_irq.h is needlessly ifdef'ed. Clean this up and minimize the amount ifdef'ed code. This fixes some build warnings when CONFIG_OF is not enabled (seen on i386 and x86_64): include/linux/of_irq.h:82:7: warning: 'struct device_node' declared inside parameter list [enabled by default] include/linux/of_irq.h:82:7: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] include/linux/of_irq.h:87:47: warning: 'struct device_node' declared inside parameter list [enabled by default] Compile tested on i386, sparc and arm. Reported-by: Randy Dunlap Cc: Grant Likely Signed-off-by: Rob Herring --- include/linux/of_irq.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 535cecf1e02f..fcd63baee5f2 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -1,8 +1,6 @@ #ifndef __OF_IRQ_H #define __OF_IRQ_H -#if defined(CONFIG_OF) -struct of_irq; #include #include #include @@ -10,14 +8,6 @@ struct of_irq; #include #include -/* - * irq_of_parse_and_map() is used by all OF enabled platforms; but SPARC - * implements it differently. However, the prototype is the same for all, - * so declare it here regardless of the CONFIG_OF_IRQ setting. - */ -extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); - -#if defined(CONFIG_OF_IRQ) /** * of_irq - container for device_node/irq_specifier pair for an irq controller * @controller: pointer to interrupt controller device tree node @@ -71,11 +61,17 @@ extern int of_irq_to_resource(struct device_node *dev, int index, extern int of_irq_count(struct device_node *dev); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); -extern struct device_node *of_irq_find_parent(struct device_node *child); extern void of_irq_init(const struct of_device_id *matches); -#endif /* CONFIG_OF_IRQ */ +#if defined(CONFIG_OF) +/* + * irq_of_parse_and_map() is used by all OF enabled platforms; but SPARC + * implements it differently. However, the prototype is the same for all, + * so declare it here regardless of the CONFIG_OF_IRQ setting. + */ +extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); +extern struct device_node *of_irq_find_parent(struct device_node *child); #else /* !CONFIG_OF */ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, -- cgit v1.2.3-71-gd317 From 5bc2afc2b53fc73f154e6344cd898585628e6d27 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 23 Sep 2013 18:01:28 -0400 Subject: NFSv4: Honour the 'opened' parameter in the atomic_open() filesystem method Determine if we've created a new file by examining the directory change attribute and/or the O_EXCL flag. This fixes a regression when doing a non-exclusive create of a new file. If the FILE_CREATED flag is not set, the atomic_open() command will perform full file access permissions checks instead of just checking for MAY_OPEN. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/nfs4file.c | 3 ++- fs/nfs/nfs4proc.c | 26 +++++++++++++++++++------- include/linux/nfs_xdr.h | 3 ++- 4 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 854a8f05a610..02b0df769e2d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1458,7 +1458,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, trace_nfs_atomic_open_enter(dir, ctx, open_flags); nfs_block_sillyrename(dentry->d_parent); - inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); nfs_unblock_sillyrename(dentry->d_parent); if (IS_ERR(inode)) { err = PTR_ERR(inode); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e5b804dd944c..77efaf15ec90 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -19,6 +19,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct inode *dir; unsigned openflags = filp->f_flags; struct iattr attr; + int opened = 0; int err; /* @@ -55,7 +56,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_wb_all(inode); } - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 989bb9d3074d..488ef9b5c51a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -912,6 +912,7 @@ struct nfs4_opendata { struct iattr attrs; unsigned long timestamp; unsigned int rpc_done : 1; + unsigned int file_created : 1; unsigned int is_recover : 1; int rpc_status; int cancelled; @@ -1946,8 +1947,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(server, &data->f_attr); - if (o_arg->open_flags & O_CREAT) + if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); + if (o_arg->open_flags & O_EXCL) + data->file_created = 1; + else if (o_res->cinfo.before != o_res->cinfo.after) + data->file_created = 1; + } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -2191,7 +2197,8 @@ static int _nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, + int *opened) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; @@ -2261,6 +2268,8 @@ static int _nfs4_do_open(struct inode *dir, nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } + if (opendata->file_created) + *opened |= FILE_CREATED; if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) *ctx_th = opendata->f_attr.mdsthreshold; @@ -2289,7 +2298,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, + int *opened) { struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; @@ -2297,7 +2307,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, int status; do { - status = _nfs4_do_open(dir, ctx, flags, sattr, label); + status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); res = ctx->state; trace_nfs4_open_file(ctx, flags, status); if (status == 0) @@ -2659,7 +2669,8 @@ out: } static struct inode * -nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, + int open_flags, struct iattr *attr, int *opened) { struct nfs4_state *state; struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; @@ -2667,7 +2678,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); /* Protect against concurrent sillydeletes */ - state = nfs4_do_open(dir, ctx, open_flags, attr, label); + state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened); nfs4_label_release_security(label); @@ -3332,6 +3343,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; + int opened = 0; int status = 0; ctx = alloc_nfs_open_context(dentry, FMODE_READ); @@ -3341,7 +3353,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, ctx, flags, sattr, ilabel); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 01fd84b566f7..49f52c8f4422 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1455,7 +1455,8 @@ struct nfs_rpc_ops { struct inode * (*open_context) (struct inode *dir, struct nfs_open_context *ctx, int open_flags, - struct iattr *iattr); + struct iattr *iattr, + int *); int (*have_delegation)(struct inode *, fmode_t); int (*return_delegation)(struct inode *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); -- cgit v1.2.3-71-gd317 From 2bedea8f26c92e2610f2f67889144990749461e0 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Wed, 25 Sep 2013 12:11:02 +0200 Subject: bcma: make bcma_core_pci_{up,down}() callable from atomic context This patch removes the bcma_core_pci_power_save() call from the bcma_core_pci_{up,down}() functions as it tries to schedule thus requiring to call them from non-atomic context. The function bcma_core_pci_power_save() is now exported so the calling module can explicitly use it in non-atomic context. This fixes the 'scheduling while atomic' issue reported by Tod Jackson and Joe Perches. [ 13.210710] BUG: scheduling while atomic: dhcpcd/1800/0x00000202 [ 13.210718] Modules linked in: brcmsmac nouveau coretemp kvm_intel kvm cordic brcmutil bcma dell_wmi atl1c ttm mxm_wmi wmi [ 13.210756] CPU: 2 PID: 1800 Comm: dhcpcd Not tainted 3.11.0-wl #1 [ 13.210762] Hardware name: Alienware M11x R2/M11x R2, BIOS A04 11/23/2010 [ 13.210767] ffff880177c92c40 ffff880170fd1948 ffffffff8169af5b 0000000000000007 [ 13.210777] ffff880170fd1ab0 ffff880170fd1958 ffffffff81697ee2 ffff880170fd19d8 [ 13.210785] ffffffff816a19f5 00000000000f4240 000000000000d080 ffff880170fd1fd8 [ 13.210794] Call Trace: [ 13.210813] [] dump_stack+0x4f/0x84 [ 13.210826] [] __schedule_bug+0x43/0x51 [ 13.210837] [] __schedule+0x6e5/0x810 [ 13.210845] [] schedule+0x24/0x70 [ 13.210855] [] schedule_hrtimeout_range_clock+0x10c/0x150 [ 13.210867] [] ? update_rmtp+0x60/0x60 [ 13.210877] [] ? hrtimer_start_range_ns+0xf/0x20 [ 13.210887] [] schedule_hrtimeout_range+0xe/0x10 [ 13.210897] [] usleep_range+0x3b/0x40 [ 13.210910] [] bcma_pcie_mdio_set_phy.isra.3+0x4f/0x80 [bcma] [ 13.210921] [] bcma_pcie_mdio_write.isra.4+0xbf/0xd0 [bcma] [ 13.210932] [] bcma_pcie_mdio_writeread.isra.6.constprop.13+0x18/0x30 [bcma] [ 13.210942] [] bcma_core_pci_power_save+0x3e/0x80 [bcma] [ 13.210953] [] bcma_core_pci_up+0x2d/0x60 [bcma] [ 13.210975] [] brcms_c_up+0xfc/0x430 [brcmsmac] [ 13.210989] [] brcms_up+0x1d/0x20 [brcmsmac] [ 13.211003] [] brcms_ops_start+0x298/0x340 [brcmsmac] [ 13.211020] [] ? cfg80211_netdev_notifier_call+0xd2/0x5f0 [ 13.211030] [] ? packet_notifier+0xad/0x1d0 [ 13.211064] [] ieee80211_do_open+0x325/0xf80 [ 13.211076] [] ? __raw_notifier_call_chain+0x9/0x10 [ 13.211086] [] ieee80211_open+0x71/0x80 [ 13.211101] [] __dev_open+0x87/0xe0 [ 13.211109] [] __dev_change_flags+0x9c/0x180 [ 13.211117] [] dev_change_flags+0x23/0x70 [ 13.211127] [] devinet_ioctl+0x5b8/0x6a0 [ 13.211136] [] inet_ioctl+0x75/0x90 [ 13.211147] [] sock_do_ioctl+0x2b/0x70 [ 13.211155] [] sock_ioctl+0x71/0x2a0 [ 13.211169] [] do_vfs_ioctl+0x87/0x520 [ 13.211180] [] ? ____fput+0x9/0x10 [ 13.211198] [] ? task_work_run+0x9c/0xd0 [ 13.211202] [] SyS_ioctl+0x91/0xb0 [ 13.211208] [] system_call_fastpath+0x16/0x1b [ 13.211217] NOHZ: local_softirq_pending 202 The issue was introduced in v3.11 kernel by following commit: commit aa51e598d04c6acf5477934cd6383f5a17ce9029 Author: Hauke Mehrtens Date: Sat Aug 24 00:32:31 2013 +0200 brcmsmac: use bcma PCIe up and down functions replace the calls to bcma_core_pci_extend_L1timer() by calls to the newly introduced bcma_core_pci_ip() and bcma_core_pci_down() Signed-off-by: Hauke Mehrtens Cc: Arend van Spriel Signed-off-by: John W. Linville This fix has been discussed with Hauke Mehrtens [1] selection option 3) and is intended for v3.12. Ref: [1] http://mid.gmane.org/5239B12D.3040206@hauke-m.de Cc: # 3.11.x Cc: Tod Jackson Cc: Joe Perches Cc: Rafal Milecki Cc: Hauke Mehrtens Reviewed-by: Hante Meuleman Signed-off-by: Arend van Spriel Signed-off-by: John W. Linville --- drivers/bcma/driver_pci.c | 49 +++++++++++++++++++----------------- include/linux/bcma/bcma_driver_pci.h | 1 + 2 files changed, 27 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index c9fd6943ce45..50329d1057ed 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -210,25 +210,6 @@ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) } } -static void bcma_core_pci_power_save(struct bcma_drv_pci *pc, bool up) -{ - u16 data; - - if (pc->core->id.rev >= 15 && pc->core->id.rev <= 20) { - data = up ? 0x74 : 0x7C; - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7F64); - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); - } else if (pc->core->id.rev >= 21 && pc->core->id.rev <= 22) { - data = up ? 0x75 : 0x7D; - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7E65); - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); - } -} - /************************************************** * Init. **************************************************/ @@ -255,6 +236,32 @@ void bcma_core_pci_init(struct bcma_drv_pci *pc) bcma_core_pci_clientmode_init(pc); } +void bcma_core_pci_power_save(struct bcma_bus *bus, bool up) +{ + struct bcma_drv_pci *pc; + u16 data; + + if (bus->hosttype != BCMA_HOSTTYPE_PCI) + return; + + pc = &bus->drv_pci[0]; + + if (pc->core->id.rev >= 15 && pc->core->id.rev <= 20) { + data = up ? 0x74 : 0x7C; + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7F64); + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); + } else if (pc->core->id.rev >= 21 && pc->core->id.rev <= 22) { + data = up ? 0x75 : 0x7D; + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7E65); + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); + } +} +EXPORT_SYMBOL_GPL(bcma_core_pci_power_save); + int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable) { @@ -310,8 +317,6 @@ void bcma_core_pci_up(struct bcma_bus *bus) pc = &bus->drv_pci[0]; - bcma_core_pci_power_save(pc, true); - bcma_core_pci_extend_L1timer(pc, true); } EXPORT_SYMBOL_GPL(bcma_core_pci_up); @@ -326,7 +331,5 @@ void bcma_core_pci_down(struct bcma_bus *bus) pc = &bus->drv_pci[0]; bcma_core_pci_extend_L1timer(pc, false); - - bcma_core_pci_power_save(pc, false); } EXPORT_SYMBOL_GPL(bcma_core_pci_down); diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index d66033f418c9..0333e605ea0d 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -242,6 +242,7 @@ extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); extern void bcma_core_pci_up(struct bcma_bus *bus); extern void bcma_core_pci_down(struct bcma_bus *bus); +extern void bcma_core_pci_power_save(struct bcma_bus *bus, bool up); extern int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev); extern int bcma_core_pci_plat_dev_init(struct pci_dev *dev); -- cgit v1.2.3-71-gd317 From 3a4916050ba2e0f1d114ef540abdf02b2b173e61 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 6 Sep 2013 11:49:56 -0700 Subject: Drivers: hv: util: Correctly support ws2008R2 and earlier The current code does not correctly negotiate the version numbers for the util driver when hosted on earlier hosts. The version numbers presented by this driver were not compatible with the version numbers supported by Windows Server 2008. Fix this problem. I would like to thank Olaf Hering (ohering@suse.com) for identifying the problem. Reported-by: Olaf Hering Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_kvp.c | 38 ++++++++++++++++++-------- drivers/hv/hv_snapshot.c | 6 ++-- drivers/hv/hv_util.c | 71 ++++++++++++++++++++++++++++++++++++------------ include/linux/hyperv.h | 7 +++-- 4 files changed, 88 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 28b03325b872..09988b289622 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -32,13 +32,17 @@ /* * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7) */ +#define WS2008_SRV_MAJOR 1 +#define WS2008_SRV_MINOR 0 +#define WS2008_SRV_VERSION (WS2008_SRV_MAJOR << 16 | WS2008_SRV_MINOR) + #define WIN7_SRV_MAJOR 3 #define WIN7_SRV_MINOR 0 -#define WIN7_SRV_MAJOR_MINOR (WIN7_SRV_MAJOR << 16 | WIN7_SRV_MINOR) +#define WIN7_SRV_VERSION (WIN7_SRV_MAJOR << 16 | WIN7_SRV_MINOR) #define WIN8_SRV_MAJOR 4 #define WIN8_SRV_MINOR 0 -#define WIN8_SRV_MAJOR_MINOR (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) +#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) /* * Global state maintained for transaction that is being processed. @@ -587,6 +591,8 @@ void hv_kvp_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct icmsg_negotiate *negop = NULL; + int util_fw_version; + int kvp_srv_version; if (kvp_transaction.active) { /* @@ -606,17 +612,26 @@ void hv_kvp_onchannelcallback(void *context) if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { /* - * We start with win8 version and if the host cannot - * support that we use the previous version. + * Based on the host, select appropriate + * framework and service versions we will + * negotiate. */ - if (vmbus_prep_negotiate_resp(icmsghdrp, negop, - recv_buffer, UTIL_FW_MAJOR_MINOR, - WIN8_SRV_MAJOR_MINOR)) - goto done; - + switch (vmbus_proto_version) { + case (VERSION_WS2008): + util_fw_version = UTIL_WS2K8_FW_VERSION; + kvp_srv_version = WS2008_SRV_VERSION; + break; + case (VERSION_WIN7): + util_fw_version = UTIL_FW_VERSION; + kvp_srv_version = WIN7_SRV_VERSION; + break; + default: + util_fw_version = UTIL_FW_VERSION; + kvp_srv_version = WIN8_SRV_VERSION; + } vmbus_prep_negotiate_resp(icmsghdrp, negop, - recv_buffer, UTIL_FW_MAJOR_MINOR, - WIN7_SRV_MAJOR_MINOR); + recv_buffer, util_fw_version, + kvp_srv_version); } else { kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ @@ -649,7 +664,6 @@ void hv_kvp_onchannelcallback(void *context) return; } -done: icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index e4572f3f2834..0c3546224376 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -26,7 +26,7 @@ #define VSS_MAJOR 5 #define VSS_MINOR 0 -#define VSS_MAJOR_MINOR (VSS_MAJOR << 16 | VSS_MINOR) +#define VSS_VERSION (VSS_MAJOR << 16 | VSS_MINOR) @@ -190,8 +190,8 @@ void hv_vss_onchannelcallback(void *context) if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { vmbus_prep_negotiate_resp(icmsghdrp, negop, - recv_buffer, UTIL_FW_MAJOR_MINOR, - VSS_MAJOR_MINOR); + recv_buffer, UTIL_FW_VERSION, + VSS_VERSION); } else { vss_msg = (struct hv_vss_msg *)&recv_buffer[ sizeof(struct vmbuspipe_hdr) + diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index cb82233541b1..273e3ddb3a20 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -28,17 +28,32 @@ #include #include -#define SHUTDOWN_MAJOR 3 -#define SHUTDOWN_MINOR 0 -#define SHUTDOWN_MAJOR_MINOR (SHUTDOWN_MAJOR << 16 | SHUTDOWN_MINOR) -#define TIMESYNCH_MAJOR 3 -#define TIMESYNCH_MINOR 0 -#define TIMESYNCH_MAJOR_MINOR (TIMESYNCH_MAJOR << 16 | TIMESYNCH_MINOR) +#define SD_MAJOR 3 +#define SD_MINOR 0 +#define SD_VERSION (SD_MAJOR << 16 | SD_MINOR) -#define HEARTBEAT_MAJOR 3 -#define HEARTBEAT_MINOR 0 -#define HEARTBEAT_MAJOR_MINOR (HEARTBEAT_MAJOR << 16 | HEARTBEAT_MINOR) +#define SD_WS2008_MAJOR 1 +#define SD_WS2008_VERSION (SD_WS2008_MAJOR << 16 | SD_MINOR) + +#define TS_MAJOR 3 +#define TS_MINOR 0 +#define TS_VERSION (TS_MAJOR << 16 | TS_MINOR) + +#define TS_WS2008_MAJOR 1 +#define TS_WS2008_VERSION (TS_WS2008_MAJOR << 16 | TS_MINOR) + +#define HB_MAJOR 3 +#define HB_MINOR 0 +#define HB_VERSION (HB_MAJOR << 16 | HB_MINOR) + +#define HB_WS2008_MAJOR 1 +#define HB_WS2008_VERSION (HB_WS2008_MAJOR << 16 | HB_MINOR) + +static int sd_srv_version; +static int ts_srv_version; +static int hb_srv_version; +static int util_fw_version; static void shutdown_onchannelcallback(void *context); static struct hv_util_service util_shutdown = { @@ -99,8 +114,8 @@ static void shutdown_onchannelcallback(void *context) if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { vmbus_prep_negotiate_resp(icmsghdrp, negop, - shut_txf_buf, UTIL_FW_MAJOR_MINOR, - SHUTDOWN_MAJOR_MINOR); + shut_txf_buf, util_fw_version, + sd_srv_version); } else { shutdown_msg = (struct shutdown_msg_data *)&shut_txf_buf[ @@ -216,6 +231,7 @@ static void timesync_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct ictimesync_data *timedatap; u8 *time_txf_buf = util_timesynch.recv_buffer; + struct icmsg_negotiate *negop = NULL; vmbus_recvpacket(channel, time_txf_buf, PAGE_SIZE, &recvlen, &requestid); @@ -225,9 +241,10 @@ static void timesync_onchannelcallback(void *context) sizeof(struct vmbuspipe_hdr)]; if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - vmbus_prep_negotiate_resp(icmsghdrp, NULL, time_txf_buf, - UTIL_FW_MAJOR_MINOR, - TIMESYNCH_MAJOR_MINOR); + vmbus_prep_negotiate_resp(icmsghdrp, negop, + time_txf_buf, + util_fw_version, + ts_srv_version); } else { timedatap = (struct ictimesync_data *)&time_txf_buf[ sizeof(struct vmbuspipe_hdr) + @@ -257,6 +274,7 @@ static void heartbeat_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct heartbeat_msg_data *heartbeat_msg; u8 *hbeat_txf_buf = util_heartbeat.recv_buffer; + struct icmsg_negotiate *negop = NULL; vmbus_recvpacket(channel, hbeat_txf_buf, PAGE_SIZE, &recvlen, &requestid); @@ -266,9 +284,9 @@ static void heartbeat_onchannelcallback(void *context) sizeof(struct vmbuspipe_hdr)]; if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - vmbus_prep_negotiate_resp(icmsghdrp, NULL, - hbeat_txf_buf, UTIL_FW_MAJOR_MINOR, - HEARTBEAT_MAJOR_MINOR); + vmbus_prep_negotiate_resp(icmsghdrp, negop, + hbeat_txf_buf, util_fw_version, + hb_srv_version); } else { heartbeat_msg = (struct heartbeat_msg_data *)&hbeat_txf_buf[ @@ -321,6 +339,25 @@ static int util_probe(struct hv_device *dev, goto error; hv_set_drvdata(dev, srv); + /* + * Based on the host; initialize the framework and + * service version numbers we will negotiate. + */ + switch (vmbus_proto_version) { + case (VERSION_WS2008): + util_fw_version = UTIL_WS2K8_FW_VERSION; + sd_srv_version = SD_WS2008_VERSION; + ts_srv_version = TS_WS2008_VERSION; + hb_srv_version = HB_WS2008_VERSION; + break; + + default: + util_fw_version = UTIL_FW_VERSION; + sd_srv_version = SD_VERSION; + ts_srv_version = TS_VERSION; + hb_srv_version = HB_VERSION; + } + return 0; error: diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a3b8b2e2d244..d98503bde7e9 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -30,10 +30,13 @@ /* * Framework version for util services. */ +#define UTIL_FW_MINOR 0 + +#define UTIL_WS2K8_FW_MAJOR 1 +#define UTIL_WS2K8_FW_VERSION (UTIL_WS2K8_FW_MAJOR << 16 | UTIL_FW_MINOR) #define UTIL_FW_MAJOR 3 -#define UTIL_FW_MINOR 0 -#define UTIL_FW_MAJOR_MINOR (UTIL_FW_MAJOR << 16 | UTIL_FW_MINOR) +#define UTIL_FW_VERSION (UTIL_FW_MAJOR << 16 | UTIL_FW_MINOR) /* -- cgit v1.2.3-71-gd317 From 083986e8248d978b6c961d3da6beb0c921c68220 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 28 Sep 2013 11:23:59 +0200 Subject: mutex: replace CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX with simple ifdef Linus suggested to replace #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX #define arch_mutex_cpu_relax() cpu_relax() #endif with just a simple #ifndef arch_mutex_cpu_relax # define arch_mutex_cpu_relax() cpu_relax() #endif to get rid of CONFIG_HAVE_CPU_RELAX_SIMPLE. So architectures can simply define arch_mutex_cpu_relax if they want an architecture specific function instead of having to add a select statement in their Kconfig in addition. Suggested-by: Linus Torvalds Signed-off-by: Heiko Carstens --- arch/Kconfig | 3 --- arch/s390/Kconfig | 1 - arch/s390/include/asm/mutex.h | 2 -- arch/s390/include/asm/processor.h | 2 ++ include/linux/mutex.h | 6 +++--- 5 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 1feb169274fe..af2cc6eabcc7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -286,9 +286,6 @@ config HAVE_PERF_USER_STACK_DUMP config HAVE_ARCH_JUMP_LABEL bool -config HAVE_ARCH_MUTEX_CPU_RELAX - bool - config HAVE_RCU_TABLE_FREE bool diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index dcc6ac2d8026..d3fa84070b82 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -102,7 +102,6 @@ config S390 select GENERIC_TIME_VSYSCALL_OLD select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 - select HAVE_ARCH_MUTEX_CPU_RELAX select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h index 688271f5f2e4..458c1f7fbc18 100644 --- a/arch/s390/include/asm/mutex.h +++ b/arch/s390/include/asm/mutex.h @@ -7,5 +7,3 @@ */ #include - -#define arch_mutex_cpu_relax() barrier() diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 0eb37505cab1..ca7821f07260 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -198,6 +198,8 @@ static inline void cpu_relax(void) barrier(); } +#define arch_mutex_cpu_relax() barrier() + static inline void psw_set_key(unsigned int key) { asm volatile("spka 0(%0)" : : "d" (key)); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index ccd4260834c5..bab49da8a0f0 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -15,8 +15,8 @@ #include #include #include - #include +#include /* * Simple, straightforward mutexes with strict semantics: @@ -175,8 +175,8 @@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); -#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX -#define arch_mutex_cpu_relax() cpu_relax() +#ifndef arch_mutex_cpu_relax +# define arch_mutex_cpu_relax() cpu_relax() #endif #endif -- cgit v1.2.3-71-gd317 From 60e453a940ac678565b6641d65f8c18541bb9f28 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 23 Sep 2013 20:59:35 +0800 Subject: USBNET: fix handling padding packet Commit 638c5115a7949(USBNET: support DMA SG) introduces DMA SG if the usb host controller is capable of building packet from discontinuous buffers, but missed handling padding packet when building DMA SG. This patch attachs the pre-allocated padding packet at the end of the sg list, so padding packet can be sent to device if drivers require that. Reported-by: David Laight Acked-by: Oliver Neukum Signed-off-by: Ming Lei Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 27 +++++++++++++++++++++------ include/linux/usb/usbnet.h | 1 + 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 7b331e613e02..bf94e10a37c8 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1241,7 +1241,9 @@ static int build_dma_sg(const struct sk_buff *skb, struct urb *urb) if (num_sgs == 1) return 0; - urb->sg = kmalloc(num_sgs * sizeof(struct scatterlist), GFP_ATOMIC); + /* reserve one for zero packet */ + urb->sg = kmalloc((num_sgs + 1) * sizeof(struct scatterlist), + GFP_ATOMIC); if (!urb->sg) return -ENOMEM; @@ -1305,7 +1307,7 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, if (build_dma_sg(skb, urb) < 0) goto drop; } - entry->length = length = urb->transfer_buffer_length; + length = urb->transfer_buffer_length; /* don't assume the hardware handles USB_ZERO_PACKET * NOTE: strictly conforming cdc-ether devices should expect @@ -1317,15 +1319,18 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, if (length % dev->maxpacket == 0) { if (!(info->flags & FLAG_SEND_ZLP)) { if (!(info->flags & FLAG_MULTI_PACKET)) { - urb->transfer_buffer_length++; - if (skb_tailroom(skb)) { + length++; + if (skb_tailroom(skb) && !urb->num_sgs) { skb->data[skb->len] = 0; __skb_put(skb, 1); - } + } else if (urb->num_sgs) + sg_set_buf(&urb->sg[urb->num_sgs++], + dev->padding_pkt, 1); } } else urb->transfer_flags |= URB_ZERO_PACKET; } + entry->length = urb->transfer_buffer_length = length; spin_lock_irqsave(&dev->txq.lock, flags); retval = usb_autopm_get_interface_async(dev->intf); @@ -1509,6 +1514,7 @@ void usbnet_disconnect (struct usb_interface *intf) usb_kill_urb(dev->interrupt); usb_free_urb(dev->interrupt); + kfree(dev->padding_pkt); free_netdev(net); } @@ -1679,9 +1685,16 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) /* initialize max rx_qlen and tx_qlen */ usbnet_update_max_qlen(dev); + if (dev->can_dma_sg && !(info->flags & FLAG_SEND_ZLP) && + !(info->flags & FLAG_MULTI_PACKET)) { + dev->padding_pkt = kzalloc(1, GFP_KERNEL); + if (!dev->padding_pkt) + goto out4; + } + status = register_netdev (net); if (status) - goto out4; + goto out5; netif_info(dev, probe, dev->net, "register '%s' at usb-%s-%s, %s, %pM\n", udev->dev.driver->name, @@ -1699,6 +1712,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) return 0; +out5: + kfree(dev->padding_pkt); out4: usb_free_urb(dev->interrupt); out3: diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 9cb2fe8ca944..e303eef94dd5 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -42,6 +42,7 @@ struct usbnet { struct usb_host_endpoint *status; unsigned maxpacket; struct timer_list delay; + const char *padding_pkt; /* protocol/interface state */ struct net_device *net; -- cgit v1.2.3-71-gd317 From 117aad1e9e4d97448d1df3f84b08bd65811e6d6a Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Mon, 30 Sep 2013 13:45:16 -0700 Subject: mm: avoid reinserting isolated balloon pages into LRU lists Isolated balloon pages can wrongly end up in LRU lists when migrate_pages() finishes its round without draining all the isolated page list. The same issue can happen when reclaim_clean_pages_from_list() tries to reclaim pages from an isolated page list, before migration, in the CMA path. Such balloon page leak opens a race window against LRU lists shrinkers that leads us to the following kernel panic: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] shrink_page_list+0x24e/0x897 PGD 3cda2067 PUD 3d713067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 340 Comm: kswapd0 Not tainted 3.12.0-rc1-22626-g4367597 #87 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: shrink_page_list+0x24e/0x897 RSP: 0000:ffff88003da499b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88003e82bd60 RCX: 00000000000657d5 RDX: 0000000000000000 RSI: 000000000000031f RDI: ffff88003e82bd40 RBP: ffff88003da49ab0 R08: 0000000000000001 R09: 0000000081121a45 R10: ffffffff81121a45 R11: ffff88003c4a9a28 R12: ffff88003e82bd40 R13: ffff88003da0e800 R14: 0000000000000001 R15: ffff88003da49d58 FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000067d9000 CR3: 000000003ace5000 CR4: 00000000000407b0 Call Trace: shrink_inactive_list+0x240/0x3de shrink_lruvec+0x3e0/0x566 __shrink_zone+0x94/0x178 shrink_zone+0x3a/0x82 balance_pgdat+0x32a/0x4c2 kswapd+0x2f0/0x372 kthread+0xa2/0xaa ret_from_fork+0x7c/0xb0 Code: 80 7d 8f 01 48 83 95 68 ff ff ff 00 4c 89 e7 e8 5a 7b 00 00 48 85 c0 49 89 c5 75 08 80 7d 8f 00 74 3e eb 31 48 8b 80 18 01 00 00 <48> 8b 74 0d 48 8b 78 30 be 02 00 00 00 ff d2 eb RIP [] shrink_page_list+0x24e/0x897 RSP CR2: 0000000000000028 ---[ end trace 703d2451af6ffbfd ]--- Kernel panic - not syncing: Fatal exception This patch fixes the issue, by assuring the proper tests are made at putback_movable_pages() & reclaim_clean_pages_from_list() to avoid isolated balloon pages being wrongly reinserted in LRU lists. [akpm@linux-foundation.org: clarify awkward comment text] Signed-off-by: Rafael Aquini Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 25 +++++++++++++++++++++++++ mm/migrate.c | 2 +- mm/vmscan.c | 4 +++- 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f7f1d7169b11..089743ade734 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -158,6 +158,26 @@ static inline bool balloon_page_movable(struct page *page) return false; } +/* + * isolated_balloon_page - identify an isolated balloon page on private + * compaction/migration page lists. + * + * After a compaction thread isolates a balloon page for migration, it raises + * the page refcount to prevent concurrent compaction threads from re-isolating + * the same page. For that reason putback_movable_pages(), or other routines + * that need to identify isolated balloon pages on private pagelists, cannot + * rely on balloon_page_movable() to accomplish the task. + */ +static inline bool isolated_balloon_page(struct page *page) +{ + /* Already isolated balloon pages, by default, have a raised refcount */ + if (page_flags_cleared(page) && !page_mapped(page) && + page_count(page) >= 2) + return __is_movable_balloon_page(page); + + return false; +} + /* * balloon_page_insert - insert a page into the balloon's page list and make * the page->mapping assignment accordingly. @@ -243,6 +263,11 @@ static inline bool balloon_page_movable(struct page *page) return false; } +static inline bool isolated_balloon_page(struct page *page) +{ + return false; +} + static inline bool balloon_page_isolate(struct page *page) { return false; diff --git a/mm/migrate.c b/mm/migrate.c index 9c8d5f59d30b..a26bccd44ccb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (unlikely(balloon_page_movable(page))) + if (unlikely(isolated_balloon_page(page))) balloon_page_putback(page); else putback_lru_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index beb35778c69f..53f2f82f83ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,7 @@ #include #include +#include #include "internal.h" @@ -1113,7 +1114,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page)) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } -- cgit v1.2.3-71-gd317 From 45906723578f768d029ba7774cd97b435f2c5125 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 30 Sep 2013 14:16:41 +0200 Subject: skbuff: size of hole is wrong in a comment Since commit c93bdd0e03e8 ("netvm: allow skb allocation to use PFMEMALLOC reserves"), hole size is one bit less than what is written in the comment. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2ddb48d9312c..c2d89335f637 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -498,7 +498,7 @@ struct sk_buff { * headers if needed */ __u8 encapsulation:1; - /* 7/9 bit hole (depending on ndisc_nodetype presence) */ + /* 6/8 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL -- cgit v1.2.3-71-gd317