From dd935f44a40f8fb02aff2cc0df2269c92422df1c Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@inktank.com>
Date: Wed, 28 Aug 2013 21:43:09 -0700
Subject: libceph: add function to ensure notifies are complete

Without a way to flush the osd client's notify workqueue, a watch
event that is unregistered could continue receiving callbacks
indefinitely.

Unregistering the event simply means no new notifies are added to the
queue, but there may still be events in the queue that will call the
watch callback for the event. If the queue is flushed after the event
is unregistered, the caller can be sure no more watch callbacks will
occur for the canceled watch.

Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 include/linux/ceph/osd_client.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ce6df39f60ff..8f47625a0661 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -335,6 +335,8 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 				  struct ceph_osd_request *req);
 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
 
+extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+
 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 			       struct ceph_vino vino,
 			       struct ceph_file_layout *layout,
-- 
cgit v1.2.3-71-gd317


From 7bd36014460f793c19e7d6c94dab67b0afcfcb7f Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 11 Sep 2013 16:50:56 -0700
Subject: timekeeping: Fix HRTICK related deadlock from ntp lock changes

Gerlando Falauto reported that when HRTICK is enabled, it is
possible to trigger system deadlocks. These were hard to
reproduce, as HRTICK has been broken in the past, but seemed
to be connected to the timekeeping_seq lock.

Since seqlock/seqcount's aren't supported w/ lockdep, I added
some extra spinlock based locking and triggered the following
lockdep output:

[   15.849182] ntpd/4062 is trying to acquire lock:
[   15.849765]  (&(&pool->lock)->rlock){..-...}, at: [<ffffffff810aa9b5>] __queue_work+0x145/0x480
[   15.850051]
[   15.850051] but task is already holding lock:
[   15.850051]  (timekeeper_lock){-.-.-.}, at: [<ffffffff810df6df>] do_adjtimex+0x7f/0x100

<snip>

[   15.850051] Chain exists of: &(&pool->lock)->rlock --> &p->pi_lock --> timekeeper_lock
[   15.850051]  Possible unsafe locking scenario:
[   15.850051]
[   15.850051]        CPU0                    CPU1
[   15.850051]        ----                    ----
[   15.850051]   lock(timekeeper_lock);
[   15.850051]                                lock(&p->pi_lock);
[   15.850051] lock(timekeeper_lock);
[   15.850051] lock(&(&pool->lock)->rlock);
[   15.850051]
[   15.850051]  *** DEADLOCK ***

The deadlock was introduced by 06c017fdd4dc48451a ("timekeeping:
Hold timekeepering locks in do_adjtimex and hardpps") in 3.10

This patch avoids this deadlock, by moving the call to
schedule_delayed_work() outside of the timekeeper lock
critical section.

Reported-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Tested-by: Lin Ming <minggr@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: stable <stable@vger.kernel.org> #3.11, 3.10
Link: http://lkml.kernel.org/r/1378943457-27314-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/timex.h     | 1 +
 kernel/time/ntp.c         | 6 ++----
 kernel/time/timekeeping.c | 2 ++
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index b3726e61368e..dd3edd7dfc94 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -141,6 +141,7 @@ extern int do_adjtimex(struct timex *);
 extern void hardpps(const struct timespec *, const struct timespec *);
 
 int read_current_timer(unsigned long *timer_val);
+void ntp_notify_cmos_timer(void);
 
 /* The clock frequency of the i8253/i8254 PIT */
 #define PIT_TICK_RATE 1193182ul
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8f5b3b98577b..bb2215174f05 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work)
 	schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
 }
 
-static void notify_cmos_timer(void)
+void ntp_notify_cmos_timer(void)
 {
 	schedule_delayed_work(&sync_cmos_work, 0);
 }
 
 #else
-static inline void notify_cmos_timer(void) { }
+void ntp_notify_cmos_timer(void) { }
 #endif
 
 
@@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
 	if (!(time_status & STA_NANO))
 		txc->time.tv_usec /= NSEC_PER_USEC;
 
-	notify_cmos_timer();
-
 	return result;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 48b9fffabdc2..947ba25a95a0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1703,6 +1703,8 @@ int do_adjtimex(struct timex *txc)
 	write_seqcount_end(&timekeeper_seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
+	ntp_notify_cmos_timer();
+
 	return ret;
 }
 
-- 
cgit v1.2.3-71-gd317


From 35e4237973665c8a1ad4e3f7a7cb87573deaa24a Mon Sep 17 00:00:00 2001
From: Joseph Gasparakis <joseph.gasparakis@intel.com>
Date: Fri, 13 Sep 2013 07:34:13 -0700
Subject: vxlan: Fix sparse warnings

This patch fixes sparse warnings when incorrectly handling the port number
and using int instead of unsigned int iterating through &vn->sock_list[].
Keeping the port as __be16 also makes things clearer wrt endianess.
Also, it was pointed out that vxlan_get_rx_port() had unnecessary checks
which got removed.

Signed-off-by: Joseph Gasparakis <joseph.gasparakis@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 18 ++++++++----------
 include/linux/netdevice.h |  8 ++++----
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index bf64b4191dcc..2400b1beddd5 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -564,7 +564,7 @@ static void vxlan_notify_add_rx_port(struct sock *sk)
 	struct net_device *dev;
 	struct net *net = sock_net(sk);
 	sa_family_t sa_family = sk->sk_family;
-	u16 port = htons(inet_sk(sk)->inet_sport);
+	__be16 port = inet_sk(sk)->inet_sport;
 
 	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
@@ -581,7 +581,7 @@ static void vxlan_notify_del_rx_port(struct sock *sk)
 	struct net_device *dev;
 	struct net *net = sock_net(sk);
 	sa_family_t sa_family = sk->sk_family;
-	u16 port = htons(inet_sk(sk)->inet_sport);
+	__be16 port = inet_sk(sk)->inet_sport;
 
 	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
@@ -2021,7 +2021,8 @@ static struct device_type vxlan_type = {
 };
 
 /* Calls the ndo_add_vxlan_port of the caller in order to
- * supply the listening VXLAN udp ports.
+ * supply the listening VXLAN udp ports. Callers are expected
+ * to implement the ndo_add_vxlan_port.
  */
 void vxlan_get_rx_port(struct net_device *dev)
 {
@@ -2029,16 +2030,13 @@ void vxlan_get_rx_port(struct net_device *dev)
 	struct net *net = dev_net(dev);
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	sa_family_t sa_family;
-	u16 port;
-	int i;
-
-	if (!dev || !dev->netdev_ops || !dev->netdev_ops->ndo_add_vxlan_port)
-		return;
+	__be16 port;
+	unsigned int i;
 
 	spin_lock(&vn->sock_lock);
 	for (i = 0; i < PORT_HASH_SIZE; ++i) {
-		hlist_for_each_entry_rcu(vs, vs_head(net, i), hlist) {
-			port = htons(inet_sk(vs->sock->sk)->inet_sport);
+		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
+			port = inet_sk(vs->sock->sk)->inet_sport;
 			sa_family = vs->sock->sk->sk_family;
 			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
 							    port);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 041b42a305f6..3de49aca4519 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -950,14 +950,14 @@ struct netdev_phys_port_id {
  *	multiple net devices on single physical port.
  *
  * void (*ndo_add_vxlan_port)(struct  net_device *dev,
- *			      sa_family_t sa_family, __u16 port);
+ *			      sa_family_t sa_family, __be16 port);
  *	Called by vxlan to notiy a driver about the UDP port and socket
  *	address family that vxlan is listnening to. It is called only when
  *	a new port starts listening. The operation is protected by the
  *	vxlan_net->sock_lock.
  *
  * void (*ndo_del_vxlan_port)(struct  net_device *dev,
- *			      sa_family_t sa_family, __u16 port);
+ *			      sa_family_t sa_family, __be16 port);
  *	Called by vxlan to notify the driver about a UDP port and socket
  *	address family that vxlan is not listening to anymore. The operation
  *	is protected by the vxlan_net->sock_lock.
@@ -1093,10 +1093,10 @@ struct net_device_ops {
 							struct netdev_phys_port_id *ppid);
 	void			(*ndo_add_vxlan_port)(struct  net_device *dev,
 						      sa_family_t sa_family,
-						      __u16 port);
+						      __be16 port);
 	void			(*ndo_del_vxlan_port)(struct  net_device *dev,
 						      sa_family_t sa_family,
-						      __u16 port);
+						      __be16 port);
 };
 
 /*
-- 
cgit v1.2.3-71-gd317


From 0f1799ba1a5db4c48b72ac2da2dc70d8c190a73d Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 16 Sep 2013 20:04:53 +0200
Subject: netfilter: ipset: Consistent userspace testing with nomatch flag

The "nomatch" commandline flag should invert the matching at testing,
similarly to the --return-nomatch flag of the "set" match of iptables.
Until now it worked with the elements with "nomatch" flag only. From
now on it works with elements without the flag too, i.e:

 # ipset n test hash:net
 # ipset a test 10.0.0.0/24 nomatch
 # ipset t test 10.0.0.1
 10.0.0.1 is NOT in set test.
 # ipset t test 10.0.0.1 nomatch
 10.0.0.1 is in set test.

 # ipset a test 192.168.0.0/24
 # ipset t test 192.168.0.1
 192.168.0.1 is in set test.
 # ipset t test 192.168.0.1 nomatch
 192.168.0.1 is NOT in set test.

 Before the patch the results were

 ...
 # ipset t test 192.168.0.1
 192.168.0.1 is in set test.
 # ipset t test 192.168.0.1 nomatch
 192.168.0.1 is in set test.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h      | 6 ++++--
 net/netfilter/ipset/ip_set_core.c           | 3 +--
 net/netfilter/ipset/ip_set_hash_ipportnet.c | 4 ++--
 net/netfilter/ipset/ip_set_hash_net.c       | 4 ++--
 net/netfilter/ipset/ip_set_hash_netiface.c  | 4 ++--
 net/netfilter/ipset/ip_set_hash_netport.c   | 4 ++--
 6 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index d80e2753847c..9ac9fbde7b61 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -296,10 +296,12 @@ ip_set_eexist(int ret, u32 flags)
 
 /* Match elements marked with nomatch */
 static inline bool
-ip_set_enomatch(int ret, u32 flags, enum ipset_adt adt)
+ip_set_enomatch(int ret, u32 flags, enum ipset_adt adt, struct ip_set *set)
 {
 	return adt == IPSET_TEST &&
-	       ret == -ENOTEMPTY && ((flags >> 16) & IPSET_FLAG_NOMATCH);
+	       (set->type->features & IPSET_TYPE_NOMATCH) &&
+	       ((flags >> 16) & IPSET_FLAG_NOMATCH) &&
+	       (ret > 0 || ret == -ENOTEMPTY);
 }
 
 /* Check the NLA_F_NET_BYTEORDER flag */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index f77139007983..c8c303c3386f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1489,8 +1489,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 	if (ret == -EAGAIN)
 		ret = 1;
 
-	return (ret < 0 && ret != -ENOTEMPTY) ? ret :
-		ret > 0 ? 0 : -IPSET_ERR_EXIST;
+	return ret > 0 ? 0 : -IPSET_ERR_EXIST;
 }
 
 /* Get headed data of a set */
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index c6a525373be4..f15f3e28b9c3 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -260,7 +260,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 		e.ip = htonl(ip);
 		e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1));
 		ret = adtfn(set, &e, &ext, &ext, flags);
-		return ip_set_enomatch(ret, flags, adt) ? 1 :
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 		       ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -544,7 +544,7 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
 		ret = adtfn(set, &e, &ext, &ext, flags);
-		return ip_set_enomatch(ret, flags, adt) ? 1 :
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 		       ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index da740ceb56ae..223e9f546d0f 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -199,7 +199,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
 		e.ip = htonl(ip & ip_set_hostmask(e.cidr));
 		ret = adtfn(set, &e, &ext, &ext, flags);
-		return ip_set_enomatch(ret, flags, adt) ? 1 :
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret:
 		       ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -396,7 +396,7 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	ret = adtfn(set, &e, &ext, &ext, flags);
 
-	return ip_set_enomatch(ret, flags, adt) ? 1 :
+	return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 	       ip_set_eexist(ret, flags) ? 0 : ret;
 }
 
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 84ae6f6ce624..7d798d5d5cd3 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -368,7 +368,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
 		e.ip = htonl(ip & ip_set_hostmask(e.cidr));
 		ret = adtfn(set, &e, &ext, &ext, flags);
-		return ip_set_enomatch(ret, flags, adt) ? 1 :
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 		       ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -634,7 +634,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	ret = adtfn(set, &e, &ext, &ext, flags);
 
-	return ip_set_enomatch(ret, flags, adt) ? 1 :
+	return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 	       ip_set_eexist(ret, flags) ? 0 : ret;
 }
 
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 9a0869853be5..09d6690bee6f 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -244,7 +244,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) {
 		e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1));
 		ret = adtfn(set, &e, &ext, &ext, flags);
-		return ip_set_enomatch(ret, flags, adt) ? 1 :
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 		       ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -489,7 +489,7 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
 		ret = adtfn(set, &e, &ext, &ext, flags);
-		return ip_set_enomatch(ret, flags, adt) ? 1 :
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 		       ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
-- 
cgit v1.2.3-71-gd317


From 7c2330f1afe13b3a934140857bc8060d00103a89 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 16 Sep 2013 18:08:02 -0700
Subject: regulator: fix fatal kernel-doc error

Fix fatal kernel-doc error in <linux/regulator/driver.h>:

Error(include/linux/regulator/driver.h:52): cannot understand prototype: 'struct regulator_linear_range '

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
[Rewrote first line -- broonie]
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 include/linux/regulator/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 67e13aa5a478..9bdad43ad228 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -40,6 +40,8 @@ enum regulator_status {
 };
 
 /**
+ * struct regulator_linear_range - specify linear voltage ranges
+ *
  * Specify a range of voltages for regulator_map_linar_range() and
  * regulator_list_linear_range().
  *
-- 
cgit v1.2.3-71-gd317


From f84cb8a46a771f36a04a02c61ea635c968ed5f6a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 19 Sep 2013 12:13:58 -0400
Subject: dm mpath: disable WRITE SAME if it fails

Workaround the SCSI layer's problematic WRITE SAME heuristics by
disabling WRITE SAME in the DM multipath device's queue_limits if an
underlying device disabled it.

The WRITE SAME heuristics, with both the original commit 5db44863b6eb
("[SCSI] sd: Implement support for WRITE SAME") and the updated commit
66c28f971 ("[SCSI] sd: Update WRITE SAME heuristics"), default to enabling
WRITE SAME(10) even without successfully determining it is supported.
After the first failed WRITE SAME the SCSI layer will disable WRITE SAME
for the device (by setting sdkp->device->no_write_same which results in
'max_write_same_sectors' in device's queue_limits to be set to 0).

When a device is stacked ontop of such a SCSI device any changes to that
SCSI device's queue_limits do not automatically propagate up the stack.
As such, a DM multipath device will not have its WRITE SAME support
disabled.  This causes the block layer to continue to issue WRITE SAME
requests to the mpath device which causes paths to fail and (if mpath IO
isn't configured to queue when no paths are available) it will result in
actual IO errors to the upper layers.

This fix doesn't help configurations that have additional devices
stacked ontop of the mpath device (e.g. LVM created linear DM devices
ontop).  A proper fix that restacks all the queue_limits from the bottom
of the device stack up will need to be explored if SCSI will continue to
use this model of optimistically allowing op codes and then disabling
them after they fail for the first time.

Before this patch:

EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null)
device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121)
device-mapper: multipath: XXX snitm debugging: failing WRITE SAME IO with error=-121
end_request: critical target error, dev dm-6, sector 528
dm-6: WRITE SAME failed. Manually zeroing.
device-mapper: multipath: Failing path 8:112.
end_request: I/O error, dev dm-6, sector 4616
dm-6: WRITE SAME failed. Manually zeroing.
end_request: I/O error, dev dm-6, sector 4616
end_request: I/O error, dev dm-6, sector 5640
end_request: I/O error, dev dm-6, sector 6664
end_request: I/O error, dev dm-6, sector 7688
end_request: I/O error, dev dm-6, sector 524288
Buffer I/O error on device dm-6, logical block 65536
lost page write due to I/O error on dm-6
JBD2: Error -5 detected when updating journal superblock for dm-6-8.
end_request: I/O error, dev dm-6, sector 524296
Aborting journal on device dm-6-8.
end_request: I/O error, dev dm-6, sector 524288
Buffer I/O error on device dm-6, logical block 65536
lost page write due to I/O error on dm-6
JBD2: Error -5 detected when updating journal superblock for dm-6-8.

# cat /sys/block/sdh/queue/write_same_max_bytes
0
# cat /sys/block/dm-6/queue/write_same_max_bytes
33553920

After this patch:

EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null)
device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121)
device-mapper: multipath: XXX snitm debugging: WRITE SAME I/O failed with error=-121
end_request: critical target error, dev dm-6, sector 528
dm-6: WRITE SAME failed. Manually zeroing.

# cat /sys/block/sdh/queue/write_same_max_bytes
0
# cat /sys/block/dm-6/queue/write_same_max_bytes
0

It should be noted that WRITE SAME support wasn't enabled in DM
multipath until v3.10.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: stable@vger.kernel.org # 3.10+
---
 drivers/md/dm-mpath.c         | 11 ++++++++++-
 drivers/md/dm.c               | 11 +++++++++++
 include/linux/device-mapper.h |  3 ++-
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 075747e25f77..e08be94959fd 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1299,8 +1299,17 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
 
-	if (noretry_error(error))
+	if (noretry_error(error)) {
+		if ((clone->cmd_flags & REQ_WRITE_SAME) &&
+		    !clone->q->limits.max_write_same_sectors) {
+			struct queue_limits *limits;
+
+			/* device doesn't really support WRITE SAME, disable it */
+			limits = dm_get_queue_limits(dm_table_get_md(m->ti->table));
+			limits->max_write_same_sectors = 0;
+		}
 		return error;
+	}
 
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6a5e9ed2fcc3..7b0ccdc5d65c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2277,6 +2277,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 	return md->immutable_target_type;
 }
 
+/*
+ * The queue_limits are only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
+{
+	BUG_ON(!atomic_read(&md->holders));
+	return &md->queue->limits;
+}
+EXPORT_SYMBOL_GPL(dm_get_queue_limits);
+
 /*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 653073de09e3..ed419c62dde1 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -406,13 +406,14 @@ int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
 
+struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
+
 /*
  * Geometry functions.
  */
 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
 
-
 /*-----------------------------------------------------------------
  * Functions for manipulating device-mapper tables.
  *---------------------------------------------------------------*/
-- 
cgit v1.2.3-71-gd317


From c26d436cbf7a9549ec1073480a2e3f0d3f64e02d Mon Sep 17 00:00:00 2001
From: Andre Naujoks <nautsch2@gmail.com>
Date: Fri, 13 Sep 2013 19:37:12 +0200
Subject: lib: introduce upper case hex ascii helpers

To be able to use the hex ascii functions in case sensitive environments
the array hex_asc_upper[] and the needed functions for hex_byte_pack_upper()
are introduced.

Signed-off-by: Andre Naujoks <nautsch2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h | 11 +++++++++++
 lib/hexdump.c          |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 482ad2d84a32..672ddc4de4af 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -439,6 +439,17 @@ static inline char *hex_byte_pack(char *buf, u8 byte)
 	return buf;
 }
 
+extern const char hex_asc_upper[];
+#define hex_asc_upper_lo(x)	hex_asc_upper[((x) & 0x0f)]
+#define hex_asc_upper_hi(x)	hex_asc_upper[((x) & 0xf0) >> 4]
+
+static inline char *hex_byte_pack_upper(char *buf, u8 byte)
+{
+	*buf++ = hex_asc_upper_hi(byte);
+	*buf++ = hex_asc_upper_lo(byte);
+	return buf;
+}
+
 static inline char * __deprecated pack_hex_byte(char *buf, u8 byte)
 {
 	return hex_byte_pack(buf, byte);
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 3f0494c9d57a..8499c810909a 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -14,6 +14,8 @@
 
 const char hex_asc[] = "0123456789abcdef";
 EXPORT_SYMBOL(hex_asc);
+const char hex_asc_upper[] = "0123456789ABCDEF";
+EXPORT_SYMBOL(hex_asc_upper);
 
 /**
  * hex_to_bin - convert a hex digit to its real value
-- 
cgit v1.2.3-71-gd317


From 75afb352991ff1cd3cf5955bfe611de6d83a0c87 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Sat, 21 Sep 2013 13:57:47 -0600
Subject: block: Add nr_bios to block_rq_remap tracepoint

Adding the number of bios in a remapped request to 'block_rq_remap'
tracepoint.

Request remapper clones bios in a request to track the completion
status of each bio. So the number of bios can be useful information
for investigation.

Related discussions:
  http://www.redhat.com/archives/dm-devel/2013-August/msg00084.html
  http://www.redhat.com/archives/dm-devel/2013-September/msg00024.html

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h       | 11 +++++++++++
 include/trace/events/block.h |  6 ++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2fdb4a451b49..0e6f765aa1f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -862,6 +862,17 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 	return blk_queue_get_max_sectors(q, rq->cmd_flags);
 }
 
+static inline unsigned int blk_rq_count_bios(struct request *rq)
+{
+	unsigned int nr_bios = 0;
+	struct bio *bio;
+
+	__rq_for_each_bio(bio, rq)
+		nr_bios++;
+
+	return nr_bios;
+}
+
 /*
  * Request issue related functions.
  */
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 60ae7c3db912..4c2301d2ef1a 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -618,6 +618,7 @@ TRACE_EVENT(block_rq_remap,
 		__field( unsigned int,	nr_sector	)
 		__field( dev_t,		old_dev		)
 		__field( sector_t,	old_sector	)
+		__field( unsigned int,	nr_bios		)
 		__array( char,		rwbs,	RWBS_LEN)
 	),
 
@@ -627,15 +628,16 @@ TRACE_EVENT(block_rq_remap,
 		__entry->nr_sector	= blk_rq_sectors(rq);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
+		__entry->nr_bios	= blk_rq_count_bios(rq);
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 	),
 
-	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
 		  (unsigned long long)__entry->sector,
 		  __entry->nr_sector,
 		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
-		  (unsigned long long)__entry->old_sector)
+		  (unsigned long long)__entry->old_sector, __entry->nr_bios)
 );
 
 #endif /* _TRACE_BLOCK_H */
-- 
cgit v1.2.3-71-gd317


From 82aeef0bf03684b377678c00c05e613f30dca39c Mon Sep 17 00:00:00 2001
From: "Li, Zhen-Hua" <zhen-hual@hp.com>
Date: Fri, 13 Sep 2013 14:27:32 +0800
Subject: x86/iommu: correct ICS register offset

According to Intel Vt-D specs, the offset of Invalidation complete
status register should be 0x9C, not 0x98.

See Intel's VT-d spec, Revision 1.3, Chapter 10.4, Page 98;

Signed-off-by: Li, Zhen-Hua <zhen-hual@hp.com>
Signed-off-by: Joerg Roedel <joro@8bytes.org>
---
 include/linux/intel-iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 78e2ada50cd5..d380c5e68008 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -55,7 +55,7 @@
 #define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
 #define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
 #define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
-#define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
+#define DMAR_ICS_REG	0x9c	/* Invalidation complete status register */
 #define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
 
 #define OFFSET_STRIDE		(9)
-- 
cgit v1.2.3-71-gd317


From 9809b18fcf6b8d8ec4d3643677345907e6b50eca Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 24 Sep 2013 15:27:30 -0700
Subject: watchdog: update watchdog_thresh properly

watchdog_tresh controls how often nmi perf event counter checks per-cpu
hrtimer_interrupts counter and blows up if the counter hasn't changed
since the last check.  The counter is updated by per-cpu
watchdog_hrtimer hrtimer which is scheduled with 2/5 watchdog_thresh
period which guarantees that hrtimer is scheduled 2 times per the main
period.  Both hrtimer and perf event are started together when the
watchdog is enabled.

So far so good.  But...

But what happens when watchdog_thresh is updated from sysctl handler?

proc_dowatchdog will set a new sampling period and hrtimer callback
(watchdog_timer_fn) will use the new value in the next round.  The
problem, however, is that nobody tells the perf event that the sampling
period has changed so it is ticking with the period configured when it
has been set up.

This might result in an ear ripping dissonance between perf and hrtimer
parts if the watchdog_thresh is increased.  And even worse it might lead
to KABOOM if the watchdog is configured to panic on such a spurious
lockup.

This patch fixes the issue by updating both nmi perf even counter and
hrtimers if the threshold value has changed.

The nmi one is disabled and then reinitialized from scratch.  This has
an unpleasant side effect that the allocation of the new event might
fail theoretically so the hard lockup detector would be disabled for
such cpus.  On the other hand such a memory allocation failure is very
unlikely because the original event is deallocated right before.

It would be much nicer if we just changed perf event period but there
doesn't seem to be any API to do that right now.  It is also unfortunate
that perf_event_alloc uses GFP_KERNEL allocation unconditionally so we
cannot use on_each_cpu() and do the same thing from the per-cpu context.
The update from the current CPU should be safe because
perf_event_disable removes the event atomically before it clears the
per-cpu watchdog_ev so it cannot change anything under running handler
feet.

The hrtimer is simply restarted (thanks to Don Zickus who has pointed
this out) if it is queued because we cannot rely it will fire&adopt to
the new sampling period before a new nmi event triggers (when the
treshold is decreased).

[akpm@linux-foundation.org: the UP version of __smp_call_function_single ended up in the wrong place]
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Don Zickus <dzickus@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smp.h |  6 ++++++
 kernel/watchdog.c   | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 56 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index cfb7ca094b38..731f5237d5f4 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -155,6 +155,12 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
 
 static inline void kick_all_cpus_sync(void) {  }
 
+static inline void __smp_call_function_single(int cpuid,
+		struct call_single_data *data, int wait)
+{
+	on_each_cpu(data->func, data->info, wait);
+}
+
 #endif /* !SMP */
 
 /*
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index ced7d0609931..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = {
 	.unpark			= watchdog_enable,
 };
 
-static int watchdog_enable_all_cpus(void)
+static void restart_watchdog_hrtimer(void *info)
+{
+	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+	int ret;
+
+	/*
+	 * No need to cancel and restart hrtimer if it is currently executing
+	 * because it will reprogram itself with the new period now.
+	 * We should never see it unqueued here because we are running per-cpu
+	 * with interrupts disabled.
+	 */
+	ret = hrtimer_try_to_cancel(hrtimer);
+	if (ret == 1)
+		hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+				HRTIMER_MODE_REL_PINNED);
+}
+
+static void update_timers(int cpu)
+{
+	struct call_single_data data = {.func = restart_watchdog_hrtimer};
+	/*
+	 * Make sure that perf event counter will adopt to a new
+	 * sampling period. Updating the sampling period directly would
+	 * be much nicer but we do not have an API for that now so
+	 * let's use a big hammer.
+	 * Hrtimer will adopt the new period on the next tick but this
+	 * might be late already so we have to restart the timer as well.
+	 */
+	watchdog_nmi_disable(cpu);
+	__smp_call_function_single(cpu, &data, 1);
+	watchdog_nmi_enable(cpu);
+}
+
+static void update_timers_all_cpus(void)
+{
+	int cpu;
+
+	get_online_cpus();
+	preempt_disable();
+	for_each_online_cpu(cpu)
+		update_timers(cpu);
+	preempt_enable();
+	put_online_cpus();
+}
+
+static int watchdog_enable_all_cpus(bool sample_period_changed)
 {
 	int err = 0;
 
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void)
 			pr_err("Failed to create watchdog threads, disabled\n");
 		else
 			watchdog_running = 1;
+	} else if (sample_period_changed) {
+		update_timers_all_cpus();
 	}
 
 	return err;
@@ -537,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	 * watchdog_*_all_cpus() function takes care of this.
 	 */
 	if (watchdog_user_enabled && watchdog_thresh)
-		err = watchdog_enable_all_cpus();
+		err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
 	else
 		watchdog_disable_all_cpus();
 
@@ -557,5 +604,5 @@ void __init lockup_detector_init(void)
 	set_sample_period();
 
 	if (watchdog_user_enabled)
-		watchdog_enable_all_cpus();
+		watchdog_enable_all_cpus(false);
 }
-- 
cgit v1.2.3-71-gd317


From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:37 -0700
Subject: revert "memcg: enhance memcg iterator to support predicates"

Revert commit de57780dc659 ("memcg: enhance memcg iterator to support
predicates")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 49 ++++----------------------------
 mm/memcontrol.c            | 70 ++++++++++------------------------------------
 mm/vmscan.c                | 16 +++++++----
 3 files changed, 32 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 60e95872da29..ef2b9bd7fafa 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -53,23 +53,6 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
-enum mem_cgroup_filter_t {
-	VISIT,		/* visit current node */
-	SKIP,		/* skip the current node and continue traversal */
-	SKIP_TREE,	/* skip the whole subtree and continue traversal */
-};
-
-/*
- * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
- * iterate through the hierarchy tree. Each tree element is checked by the
- * predicate before it is returned by the iterator. If a filter returns
- * SKIP or SKIP_TREE then the iterator code continues traversal (with the
- * next node down the hierarchy or the next node that doesn't belong under the
- * memcg's subtree).
- */
-typedef enum mem_cgroup_filter_t
-(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
-
 #ifdef CONFIG_MEMCG
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -137,18 +120,9 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok);
 
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
-				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond);
-
-static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
-				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim)
-{
-	return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
-}
-
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+				   struct mem_cgroup *,
+				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
 /*
@@ -260,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		struct mem_cgroup *root);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
@@ -376,15 +349,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 		struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
-static inline struct mem_cgroup *
-mem_cgroup_iter_cond(struct mem_cgroup *root,
-		struct mem_cgroup *prev,
-		struct mem_cgroup_reclaim_cookie *reclaim,
-		mem_cgroup_iter_filter cond)
-{
-	/* first call must return non-NULL, second return NULL */
-	return (struct mem_cgroup *)(unsigned long)!prev;
-}
 
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
@@ -471,11 +435,10 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		struct mem_cgroup *root)
 {
-	return VISIT;
+	return false;
 }
 
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 916892c2b8e0..65e7bec4b0f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -862,15 +862,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -878,7 +869,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -896,31 +887,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -984,7 +955,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -997,18 +967,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1019,9 +986,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1044,7 +1009,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1055,11 +1020,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 				reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1804,14 +1765,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
  *	a) it is over its soft limit
  *	b) any parent up the hierarchy is over its soft limit
  */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		struct mem_cgroup *root)
 {
 	struct mem_cgroup *parent = memcg;
 
 	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+		return true;
 
 	/*
 	 * If any parent up to the root in the hierarchy is over its soft limit
@@ -1819,12 +1779,12 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 	 */
 	while ((parent = parent_mem_cgroup(parent))) {
 		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
+			return true;
 		if (parent == root)
 			break;
 	}
 
-	return SKIP;
+	return false;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cdd300b81485..17a7134de4d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2185,16 +2185,21 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		struct mem_cgroup *memcg = NULL;
-		mem_cgroup_iter_filter filter = (soft_reclaim) ?
-			mem_cgroup_soft_reclaim_eligible : NULL;
+		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
+		memcg = mem_cgroup_iter(root, NULL, &reclaim);
+		do {
 			struct lruvec *lruvec;
 
+			if (soft_reclaim &&
+			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
+				memcg = mem_cgroup_iter(root, memcg, &reclaim);
+				continue;
+			}
+
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2214,7 +2219,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-		}
+			memcg = mem_cgroup_iter(root, memcg, &reclaim);
+		} while (memcg);
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
-- 
cgit v1.2.3-71-gd317


From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:38 -0700
Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted
 reclaim"

Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also
for targeted reclaim")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ++----
 mm/memcontrol.c            | 14 +++++---------
 mm/vmscan.c                |  4 ++--
 3 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ef2b9bd7fafa..6054c9f3a5e8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -234,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root);
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -435,8 +434,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
 {
 	return false;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 65e7bec4b0f0..47cdc7eb1a6b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1760,13 +1760,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 #endif
 
 /*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
+ * A group is eligible for the soft limit reclaim if
+ * 	a) it is over its soft limit
  *	b) any parent up the hierarchy is over its soft limit
  */
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *parent = memcg;
 
@@ -1774,14 +1772,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		return true;
 
 	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
+	 * If any parent up the hierarchy is over its soft limit then we
+	 * have to obey and reclaim from this group as well.
 	 */
 	while ((parent = parent_mem_cgroup(parent))) {
 		if (res_counter_soft_limit_excess(&parent->res))
 			return true;
-		if (parent == root)
-			break;
 	}
 
 	return false;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17a7134de4d7..0e081cada4ba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc)
 
 static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
 {
-	return !mem_cgroup_disabled();
+	return !mem_cgroup_disabled() && global_reclaim(sc);
 }
 #else
 static bool global_reclaim(struct scan_control *sc)
@@ -2195,7 +2195,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			struct lruvec *lruvec;
 
 			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
+			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
 				memcg = mem_cgroup_iter(root, memcg, &reclaim);
 				continue;
 			}
-- 
cgit v1.2.3-71-gd317


From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 24 Sep 2013 15:27:41 -0700
Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone
 shrinking code"

Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim
tighter with zone shrinking code")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  10 ++-
 mm/memcontrol.c            | 163 +++++++++++++++++++++++++++++++++++++++------
 mm/vmscan.c                |  62 ++++++++---------
 3 files changed, 175 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6054c9f3a5e8..ecc82b37c4cc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -234,7 +234,9 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask,
+						unsigned long *total_scanned);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -434,9 +436,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
 {
-	return false;
+	return 0;
 }
 
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 852dbec07ce6..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1991,28 +1991,57 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 }
 #endif
 
-/*
- * A group is eligible for the soft limit reclaim if
- * 	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- */
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return true;
-
-	/*
-	 * If any parent up the hierarchy is over its soft limit then we
-	 * have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return true;
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
+			break;
 	}
-
-	return false;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -4761,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0e081cada4ba..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,21 +139,11 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	return !mem_cgroup_disabled() && global_reclaim(sc);
-}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	return false;
-}
 #endif
 
 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2174,8 +2164,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void
-__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_reclaimed, nr_scanned;
 
@@ -2194,12 +2183,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 		do {
 			struct lruvec *lruvec;
 
-			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
-				memcg = mem_cgroup_iter(root, memcg, &reclaim);
-				continue;
-			}
-
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2230,24 +2213,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 					 sc->nr_scanned - nr_scanned, sc));
 }
 
-
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
-	unsigned long nr_scanned = sc->nr_scanned;
-
-	__shrink_zone(zone, sc, do_soft_reclaim);
-
-	/*
-	 * No group is over the soft limit or those that are do not have
-	 * pages in the zone we are reclaiming so we have to reclaim everybody
-	 */
-	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
-		__shrink_zone(zone, sc, false);
-		return;
-	}
-}
-
 /* Returns true if compaction should go ahead for a high-order request */
 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
@@ -2309,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	bool aborted_reclaim = false;
 
 	/*
@@ -2348,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 					continue;
 				}
 			}
+			/*
+			 * This steals pages from memory cgroups over softlimit
+			 * and returns the number of reclaimed pages and
+			 * scanned pages. This works for global memory pressure
+			 * and balancing, not for a memcg's limit.
+			 */
+			nr_soft_scanned = 0;
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+						sc->order, sc->gfp_mask,
+						&nr_soft_scanned);
+			sc->nr_reclaimed += nr_soft_reclaimed;
+			sc->nr_scanned += nr_soft_scanned;
 			/* need some check for avoid more shrink_zone() */
 		}
 
@@ -2941,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
@@ -3055,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 
 			sc.nr_scanned = 0;
 
+			nr_soft_scanned = 0;
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 */
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+							order, sc.gfp_mask,
+							&nr_soft_scanned);
+			sc.nr_reclaimed += nr_soft_reclaimed;
+
 			/*
 			 * There should be no need to raise the scanning
 			 * priority if enough pages are already being scanned
-- 
cgit v1.2.3-71-gd317


From b0b8c960ffcc5bfc82d1a09ad931673e3a36c15a Mon Sep 17 00:00:00 2001
From: Rob Herring <rob.herring@calxeda.com>
Date: Wed, 4 Sep 2013 10:52:57 -0500
Subject: of: clean-up ifdefs in of_irq.h

Much of of_irq.h is needlessly ifdef'ed. Clean this up and minimize the
amount ifdef'ed code. This fixes some  build warnings when CONFIG_OF
is not enabled (seen on i386 and x86_64):

include/linux/of_irq.h:82:7: warning: 'struct device_node' declared inside parameter list [enabled by default]
include/linux/of_irq.h:82:7: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default]
include/linux/of_irq.h:87:47: warning: 'struct device_node' declared inside parameter list [enabled by default]

Compile tested on i386, sparc and arm.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Grant Likely <grant.likely@linaro.org>
Signed-off-by: Rob Herring <rob.herring@calxeda.com>
---
 include/linux/of_irq.h | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 535cecf1e02f..fcd63baee5f2 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -1,8 +1,6 @@
 #ifndef __OF_IRQ_H
 #define __OF_IRQ_H
 
-#if defined(CONFIG_OF)
-struct of_irq;
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/irq.h>
@@ -10,14 +8,6 @@ struct of_irq;
 #include <linux/ioport.h>
 #include <linux/of.h>
 
-/*
- * irq_of_parse_and_map() is used by all OF enabled platforms; but SPARC
- * implements it differently.  However, the prototype is the same for all,
- * so declare it here regardless of the CONFIG_OF_IRQ setting.
- */
-extern unsigned int irq_of_parse_and_map(struct device_node *node, int index);
-
-#if defined(CONFIG_OF_IRQ)
 /**
  * of_irq - container for device_node/irq_specifier pair for an irq controller
  * @controller: pointer to interrupt controller device tree node
@@ -71,11 +61,17 @@ extern int of_irq_to_resource(struct device_node *dev, int index,
 extern int of_irq_count(struct device_node *dev);
 extern int of_irq_to_resource_table(struct device_node *dev,
 		struct resource *res, int nr_irqs);
-extern struct device_node *of_irq_find_parent(struct device_node *child);
 
 extern void of_irq_init(const struct of_device_id *matches);
 
-#endif /* CONFIG_OF_IRQ */
+#if defined(CONFIG_OF)
+/*
+ * irq_of_parse_and_map() is used by all OF enabled platforms; but SPARC
+ * implements it differently.  However, the prototype is the same for all,
+ * so declare it here regardless of the CONFIG_OF_IRQ setting.
+ */
+extern unsigned int irq_of_parse_and_map(struct device_node *node, int index);
+extern struct device_node *of_irq_find_parent(struct device_node *child);
 
 #else /* !CONFIG_OF */
 static inline unsigned int irq_of_parse_and_map(struct device_node *dev,
-- 
cgit v1.2.3-71-gd317


From 5bc2afc2b53fc73f154e6344cd898585628e6d27 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 23 Sep 2013 18:01:28 -0400
Subject: NFSv4: Honour the 'opened' parameter in the atomic_open() filesystem
 method

Determine if we've created a new file by examining the directory change
attribute and/or the O_EXCL flag.

This fixes a regression when doing a non-exclusive create of a new file.
If the FILE_CREATED flag is not set, the atomic_open() command will
perform full file access permissions checks instead of just checking
for MAY_OPEN.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            |  2 +-
 fs/nfs/nfs4file.c       |  3 ++-
 fs/nfs/nfs4proc.c       | 26 +++++++++++++++++++-------
 include/linux/nfs_xdr.h |  3 ++-
 4 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 854a8f05a610..02b0df769e2d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1458,7 +1458,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 
 	trace_nfs_atomic_open_enter(dir, ctx, open_flags);
 	nfs_block_sillyrename(dentry->d_parent);
-	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);
 	nfs_unblock_sillyrename(dentry->d_parent);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e5b804dd944c..77efaf15ec90 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -19,6 +19,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 	struct inode *dir;
 	unsigned openflags = filp->f_flags;
 	struct iattr attr;
+	int opened = 0;
 	int err;
 
 	/*
@@ -55,7 +56,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 		nfs_wb_all(inode);
 	}
 
-	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
+	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		switch (err) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 989bb9d3074d..488ef9b5c51a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -912,6 +912,7 @@ struct nfs4_opendata {
 	struct iattr attrs;
 	unsigned long timestamp;
 	unsigned int rpc_done : 1;
+	unsigned int file_created : 1;
 	unsigned int is_recover : 1;
 	int rpc_status;
 	int cancelled;
@@ -1946,8 +1947,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 
 	nfs_fattr_map_and_free_names(server, &data->f_attr);
 
-	if (o_arg->open_flags & O_CREAT)
+	if (o_arg->open_flags & O_CREAT) {
 		update_changeattr(dir, &o_res->cinfo);
+		if (o_arg->open_flags & O_EXCL)
+			data->file_created = 1;
+		else if (o_res->cinfo.before != o_res->cinfo.after)
+			data->file_created = 1;
+	}
 	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
 		server->caps &= ~NFS_CAP_POSIX_LOCK;
 	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -2191,7 +2197,8 @@ static int _nfs4_do_open(struct inode *dir,
 			struct nfs_open_context *ctx,
 			int flags,
 			struct iattr *sattr,
-			struct nfs4_label *label)
+			struct nfs4_label *label,
+			int *opened)
 {
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
@@ -2261,6 +2268,8 @@ static int _nfs4_do_open(struct inode *dir,
 			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
 		}
 	}
+	if (opendata->file_created)
+		*opened |= FILE_CREATED;
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
 		*ctx_th = opendata->f_attr.mdsthreshold;
@@ -2289,7 +2298,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 					struct nfs_open_context *ctx,
 					int flags,
 					struct iattr *sattr,
-					struct nfs4_label *label)
+					struct nfs4_label *label,
+					int *opened)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_exception exception = { };
@@ -2297,7 +2307,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 	int status;
 
 	do {
-		status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+		status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened);
 		res = ctx->state;
 		trace_nfs4_open_file(ctx, flags, status);
 		if (status == 0)
@@ -2659,7 +2669,8 @@ out:
 }
 
 static struct inode *
-nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
+nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
+		int open_flags, struct iattr *attr, int *opened)
 {
 	struct nfs4_state *state;
 	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
@@ -2667,7 +2678,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
 	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+	state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened);
 
 	nfs4_label_release_security(label);
 
@@ -3332,6 +3343,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	struct nfs4_label l, *ilabel = NULL;
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
+	int opened = 0;
 	int status = 0;
 
 	ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3341,7 +3353,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
 
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 01fd84b566f7..49f52c8f4422 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1455,7 +1455,8 @@ struct nfs_rpc_ops {
 	struct inode * (*open_context) (struct inode *dir,
 				struct nfs_open_context *ctx,
 				int open_flags,
-				struct iattr *iattr);
+				struct iattr *iattr,
+				int *);
 	int (*have_delegation)(struct inode *, fmode_t);
 	int (*return_delegation)(struct inode *);
 	struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *);
-- 
cgit v1.2.3-71-gd317


From 2bedea8f26c92e2610f2f67889144990749461e0 Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend@broadcom.com>
Date: Wed, 25 Sep 2013 12:11:02 +0200
Subject: bcma: make bcma_core_pci_{up,down}() callable from atomic context

This patch removes the bcma_core_pci_power_save() call from
the bcma_core_pci_{up,down}() functions as it tries to schedule
thus requiring to call them from non-atomic context. The function
bcma_core_pci_power_save() is now exported so the calling module
can explicitly use it in non-atomic context. This fixes the
'scheduling while atomic' issue reported by Tod Jackson and
Joe Perches.

[   13.210710] BUG: scheduling while atomic: dhcpcd/1800/0x00000202
[   13.210718] Modules linked in: brcmsmac nouveau coretemp kvm_intel kvm cordic brcmutil bcma dell_wmi atl1c ttm mxm_wmi wmi
[   13.210756] CPU: 2 PID: 1800 Comm: dhcpcd Not tainted 3.11.0-wl #1
[   13.210762] Hardware name: Alienware M11x R2/M11x R2, BIOS A04 11/23/2010
[   13.210767]  ffff880177c92c40 ffff880170fd1948 ffffffff8169af5b 0000000000000007
[   13.210777]  ffff880170fd1ab0 ffff880170fd1958 ffffffff81697ee2 ffff880170fd19d8
[   13.210785]  ffffffff816a19f5 00000000000f4240 000000000000d080 ffff880170fd1fd8
[   13.210794] Call Trace:
[   13.210813]  [<ffffffff8169af5b>] dump_stack+0x4f/0x84
[   13.210826]  [<ffffffff81697ee2>] __schedule_bug+0x43/0x51
[   13.210837]  [<ffffffff816a19f5>] __schedule+0x6e5/0x810
[   13.210845]  [<ffffffff816a1c34>] schedule+0x24/0x70
[   13.210855]  [<ffffffff816a04fc>] schedule_hrtimeout_range_clock+0x10c/0x150
[   13.210867]  [<ffffffff810684e0>] ? update_rmtp+0x60/0x60
[   13.210877]  [<ffffffff8106915f>] ? hrtimer_start_range_ns+0xf/0x20
[   13.210887]  [<ffffffff816a054e>] schedule_hrtimeout_range+0xe/0x10
[   13.210897]  [<ffffffff8104f6fb>] usleep_range+0x3b/0x40
[   13.210910]  [<ffffffffa00371af>] bcma_pcie_mdio_set_phy.isra.3+0x4f/0x80 [bcma]
[   13.210921]  [<ffffffffa003729f>] bcma_pcie_mdio_write.isra.4+0xbf/0xd0 [bcma]
[   13.210932]  [<ffffffffa0037498>] bcma_pcie_mdio_writeread.isra.6.constprop.13+0x18/0x30 [bcma]
[   13.210942]  [<ffffffffa00374ee>] bcma_core_pci_power_save+0x3e/0x80 [bcma]
[   13.210953]  [<ffffffffa003765d>] bcma_core_pci_up+0x2d/0x60 [bcma]
[   13.210975]  [<ffffffffa03dc17c>] brcms_c_up+0xfc/0x430 [brcmsmac]
[   13.210989]  [<ffffffffa03d1a7d>] brcms_up+0x1d/0x20 [brcmsmac]
[   13.211003]  [<ffffffffa03d2498>] brcms_ops_start+0x298/0x340 [brcmsmac]
[   13.211020]  [<ffffffff81600a12>] ? cfg80211_netdev_notifier_call+0xd2/0x5f0
[   13.211030]  [<ffffffff815fa53d>] ? packet_notifier+0xad/0x1d0
[   13.211064]  [<ffffffff81656e75>] ieee80211_do_open+0x325/0xf80
[   13.211076]  [<ffffffff8106ac09>] ? __raw_notifier_call_chain+0x9/0x10
[   13.211086]  [<ffffffff81657b41>] ieee80211_open+0x71/0x80
[   13.211101]  [<ffffffff81526267>] __dev_open+0x87/0xe0
[   13.211109]  [<ffffffff8152650c>] __dev_change_flags+0x9c/0x180
[   13.211117]  [<ffffffff815266a3>] dev_change_flags+0x23/0x70
[   13.211127]  [<ffffffff8158cd68>] devinet_ioctl+0x5b8/0x6a0
[   13.211136]  [<ffffffff8158d5c5>] inet_ioctl+0x75/0x90
[   13.211147]  [<ffffffff8150b38b>] sock_do_ioctl+0x2b/0x70
[   13.211155]  [<ffffffff8150b681>] sock_ioctl+0x71/0x2a0
[   13.211169]  [<ffffffff8114ed47>] do_vfs_ioctl+0x87/0x520
[   13.211180]  [<ffffffff8113f159>] ? ____fput+0x9/0x10
[   13.211198]  [<ffffffff8106228c>] ? task_work_run+0x9c/0xd0
[   13.211202]  [<ffffffff8114f271>] SyS_ioctl+0x91/0xb0
[   13.211208]  [<ffffffff816aa252>] system_call_fastpath+0x16/0x1b
[   13.211217] NOHZ: local_softirq_pending 202

The issue was introduced in v3.11 kernel by following commit:

commit aa51e598d04c6acf5477934cd6383f5a17ce9029
Author: Hauke Mehrtens <hauke@hauke-m.de>
Date:   Sat Aug 24 00:32:31 2013 +0200

    brcmsmac: use bcma PCIe up and down functions

    replace the calls to bcma_core_pci_extend_L1timer() by calls to the
    newly introduced bcma_core_pci_ip() and bcma_core_pci_down()

    Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
    Cc: Arend van Spriel <arend@broadcom.com>
    Signed-off-by: John W. Linville <linville@tuxdriver.com>

This fix has been discussed with Hauke Mehrtens [1] selection
option 3) and is intended for v3.12.

Ref:
[1] http://mid.gmane.org/5239B12D.3040206@hauke-m.de

Cc: <stable@vger.kernel.org> # 3.11.x
Cc: Tod Jackson <tod.jackson@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Rafal Milecki <zajec5@gmail.com>
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Reviewed-by: Hante Meuleman <meuleman@broadcom.com>
Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/bcma/driver_pci.c            | 49 +++++++++++++++++++-----------------
 include/linux/bcma/bcma_driver_pci.h |  1 +
 2 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c
index c9fd6943ce45..50329d1057ed 100644
--- a/drivers/bcma/driver_pci.c
+++ b/drivers/bcma/driver_pci.c
@@ -210,25 +210,6 @@ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc)
 	}
 }
 
-static void bcma_core_pci_power_save(struct bcma_drv_pci *pc, bool up)
-{
-	u16 data;
-
-	if (pc->core->id.rev >= 15 && pc->core->id.rev <= 20) {
-		data = up ? 0x74 : 0x7C;
-		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
-					 BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7F64);
-		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
-					 BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data);
-	} else if (pc->core->id.rev >= 21 && pc->core->id.rev <= 22) {
-		data = up ? 0x75 : 0x7D;
-		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
-					 BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7E65);
-		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
-					 BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data);
-	}
-}
-
 /**************************************************
  * Init.
  **************************************************/
@@ -255,6 +236,32 @@ void bcma_core_pci_init(struct bcma_drv_pci *pc)
 		bcma_core_pci_clientmode_init(pc);
 }
 
+void bcma_core_pci_power_save(struct bcma_bus *bus, bool up)
+{
+	struct bcma_drv_pci *pc;
+	u16 data;
+
+	if (bus->hosttype != BCMA_HOSTTYPE_PCI)
+		return;
+
+	pc = &bus->drv_pci[0];
+
+	if (pc->core->id.rev >= 15 && pc->core->id.rev <= 20) {
+		data = up ? 0x74 : 0x7C;
+		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
+					 BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7F64);
+		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
+					 BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data);
+	} else if (pc->core->id.rev >= 21 && pc->core->id.rev <= 22) {
+		data = up ? 0x75 : 0x7D;
+		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
+					 BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7E65);
+		bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1,
+					 BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data);
+	}
+}
+EXPORT_SYMBOL_GPL(bcma_core_pci_power_save);
+
 int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core,
 			  bool enable)
 {
@@ -310,8 +317,6 @@ void bcma_core_pci_up(struct bcma_bus *bus)
 
 	pc = &bus->drv_pci[0];
 
-	bcma_core_pci_power_save(pc, true);
-
 	bcma_core_pci_extend_L1timer(pc, true);
 }
 EXPORT_SYMBOL_GPL(bcma_core_pci_up);
@@ -326,7 +331,5 @@ void bcma_core_pci_down(struct bcma_bus *bus)
 	pc = &bus->drv_pci[0];
 
 	bcma_core_pci_extend_L1timer(pc, false);
-
-	bcma_core_pci_power_save(pc, false);
 }
 EXPORT_SYMBOL_GPL(bcma_core_pci_down);
diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h
index d66033f418c9..0333e605ea0d 100644
--- a/include/linux/bcma/bcma_driver_pci.h
+++ b/include/linux/bcma/bcma_driver_pci.h
@@ -242,6 +242,7 @@ extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc,
 				 struct bcma_device *core, bool enable);
 extern void bcma_core_pci_up(struct bcma_bus *bus);
 extern void bcma_core_pci_down(struct bcma_bus *bus);
+extern void bcma_core_pci_power_save(struct bcma_bus *bus, bool up);
 
 extern int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev);
 extern int bcma_core_pci_plat_dev_init(struct pci_dev *dev);
-- 
cgit v1.2.3-71-gd317


From 3a4916050ba2e0f1d114ef540abdf02b2b173e61 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Fri, 6 Sep 2013 11:49:56 -0700
Subject: Drivers: hv: util: Correctly support ws2008R2 and earlier

The current code does not correctly negotiate the version numbers for the util
driver when hosted on earlier hosts. The version numbers presented by this
driver were not compatible with the version numbers supported by Windows Server
2008. Fix this problem.

I would like to thank Olaf Hering (ohering@suse.com) for identifying the problem.

Reported-by: Olaf Hering <ohering@suse.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/hv_kvp.c      | 38 ++++++++++++++++++--------
 drivers/hv/hv_snapshot.c |  6 ++--
 drivers/hv/hv_util.c     | 71 ++++++++++++++++++++++++++++++++++++------------
 include/linux/hyperv.h   |  7 +++--
 4 files changed, 88 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 28b03325b872..09988b289622 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -32,13 +32,17 @@
 /*
  * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7)
  */
+#define WS2008_SRV_MAJOR	1
+#define WS2008_SRV_MINOR	0
+#define WS2008_SRV_VERSION     (WS2008_SRV_MAJOR << 16 | WS2008_SRV_MINOR)
+
 #define WIN7_SRV_MAJOR   3
 #define WIN7_SRV_MINOR   0
-#define WIN7_SRV_MAJOR_MINOR     (WIN7_SRV_MAJOR << 16 | WIN7_SRV_MINOR)
+#define WIN7_SRV_VERSION     (WIN7_SRV_MAJOR << 16 | WIN7_SRV_MINOR)
 
 #define WIN8_SRV_MAJOR   4
 #define WIN8_SRV_MINOR   0
-#define WIN8_SRV_MAJOR_MINOR     (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
+#define WIN8_SRV_VERSION     (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
 
 /*
  * Global state maintained for transaction that is being processed.
@@ -587,6 +591,8 @@ void hv_kvp_onchannelcallback(void *context)
 
 	struct icmsg_hdr *icmsghdrp;
 	struct icmsg_negotiate *negop = NULL;
+	int util_fw_version;
+	int kvp_srv_version;
 
 	if (kvp_transaction.active) {
 		/*
@@ -606,17 +612,26 @@ void hv_kvp_onchannelcallback(void *context)
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
 			/*
-			 * We start with win8 version and if the host cannot
-			 * support that we use the previous version.
+			 * Based on the host, select appropriate
+			 * framework and service versions we will
+			 * negotiate.
 			 */
-			if (vmbus_prep_negotiate_resp(icmsghdrp, negop,
-				 recv_buffer, UTIL_FW_MAJOR_MINOR,
-				 WIN8_SRV_MAJOR_MINOR))
-				goto done;
-
+			switch (vmbus_proto_version) {
+			case (VERSION_WS2008):
+				util_fw_version = UTIL_WS2K8_FW_VERSION;
+				kvp_srv_version = WS2008_SRV_VERSION;
+				break;
+			case (VERSION_WIN7):
+				util_fw_version = UTIL_FW_VERSION;
+				kvp_srv_version = WIN7_SRV_VERSION;
+				break;
+			default:
+				util_fw_version = UTIL_FW_VERSION;
+				kvp_srv_version = WIN8_SRV_VERSION;
+			}
 			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-				 recv_buffer, UTIL_FW_MAJOR_MINOR,
-				 WIN7_SRV_MAJOR_MINOR);
+				 recv_buffer, util_fw_version,
+				 kvp_srv_version);
 
 		} else {
 			kvp_msg = (struct hv_kvp_msg *)&recv_buffer[
@@ -649,7 +664,6 @@ void hv_kvp_onchannelcallback(void *context)
 			return;
 
 		}
-done:
 
 		icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
 			| ICMSGHDRFLAG_RESPONSE;
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index e4572f3f2834..0c3546224376 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -26,7 +26,7 @@
 
 #define VSS_MAJOR  5
 #define VSS_MINOR  0
-#define VSS_MAJOR_MINOR    (VSS_MAJOR << 16 | VSS_MINOR)
+#define VSS_VERSION    (VSS_MAJOR << 16 | VSS_MINOR)
 
 
 
@@ -190,8 +190,8 @@ void hv_vss_onchannelcallback(void *context)
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
 			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-				 recv_buffer, UTIL_FW_MAJOR_MINOR,
-				 VSS_MAJOR_MINOR);
+				 recv_buffer, UTIL_FW_VERSION,
+				 VSS_VERSION);
 		} else {
 			vss_msg = (struct hv_vss_msg *)&recv_buffer[
 				sizeof(struct vmbuspipe_hdr) +
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index cb82233541b1..273e3ddb3a20 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -28,17 +28,32 @@
 #include <linux/reboot.h>
 #include <linux/hyperv.h>
 
-#define SHUTDOWN_MAJOR	3
-#define SHUTDOWN_MINOR  0
-#define SHUTDOWN_MAJOR_MINOR	(SHUTDOWN_MAJOR << 16 | SHUTDOWN_MINOR)
 
-#define TIMESYNCH_MAJOR	3
-#define TIMESYNCH_MINOR 0
-#define TIMESYNCH_MAJOR_MINOR	(TIMESYNCH_MAJOR << 16 | TIMESYNCH_MINOR)
+#define SD_MAJOR	3
+#define SD_MINOR	0
+#define SD_VERSION	(SD_MAJOR << 16 | SD_MINOR)
 
-#define HEARTBEAT_MAJOR	3
-#define HEARTBEAT_MINOR 0
-#define HEARTBEAT_MAJOR_MINOR	(HEARTBEAT_MAJOR << 16 | HEARTBEAT_MINOR)
+#define SD_WS2008_MAJOR		1
+#define SD_WS2008_VERSION	(SD_WS2008_MAJOR << 16 | SD_MINOR)
+
+#define TS_MAJOR	3
+#define TS_MINOR	0
+#define TS_VERSION	(TS_MAJOR << 16 | TS_MINOR)
+
+#define TS_WS2008_MAJOR		1
+#define TS_WS2008_VERSION	(TS_WS2008_MAJOR << 16 | TS_MINOR)
+
+#define HB_MAJOR	3
+#define HB_MINOR 0
+#define HB_VERSION	(HB_MAJOR << 16 | HB_MINOR)
+
+#define HB_WS2008_MAJOR	1
+#define HB_WS2008_VERSION	(HB_WS2008_MAJOR << 16 | HB_MINOR)
+
+static int sd_srv_version;
+static int ts_srv_version;
+static int hb_srv_version;
+static int util_fw_version;
 
 static void shutdown_onchannelcallback(void *context);
 static struct hv_util_service util_shutdown = {
@@ -99,8 +114,8 @@ static void shutdown_onchannelcallback(void *context)
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
 			vmbus_prep_negotiate_resp(icmsghdrp, negop,
-					shut_txf_buf, UTIL_FW_MAJOR_MINOR,
-					SHUTDOWN_MAJOR_MINOR);
+					shut_txf_buf, util_fw_version,
+					sd_srv_version);
 		} else {
 			shutdown_msg =
 				(struct shutdown_msg_data *)&shut_txf_buf[
@@ -216,6 +231,7 @@ static void timesync_onchannelcallback(void *context)
 	struct icmsg_hdr *icmsghdrp;
 	struct ictimesync_data *timedatap;
 	u8 *time_txf_buf = util_timesynch.recv_buffer;
+	struct icmsg_negotiate *negop = NULL;
 
 	vmbus_recvpacket(channel, time_txf_buf,
 			 PAGE_SIZE, &recvlen, &requestid);
@@ -225,9 +241,10 @@ static void timesync_onchannelcallback(void *context)
 				sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			vmbus_prep_negotiate_resp(icmsghdrp, NULL, time_txf_buf,
-						UTIL_FW_MAJOR_MINOR,
-						TIMESYNCH_MAJOR_MINOR);
+			vmbus_prep_negotiate_resp(icmsghdrp, negop,
+						time_txf_buf,
+						util_fw_version,
+						ts_srv_version);
 		} else {
 			timedatap = (struct ictimesync_data *)&time_txf_buf[
 				sizeof(struct vmbuspipe_hdr) +
@@ -257,6 +274,7 @@ static void heartbeat_onchannelcallback(void *context)
 	struct icmsg_hdr *icmsghdrp;
 	struct heartbeat_msg_data *heartbeat_msg;
 	u8 *hbeat_txf_buf = util_heartbeat.recv_buffer;
+	struct icmsg_negotiate *negop = NULL;
 
 	vmbus_recvpacket(channel, hbeat_txf_buf,
 			 PAGE_SIZE, &recvlen, &requestid);
@@ -266,9 +284,9 @@ static void heartbeat_onchannelcallback(void *context)
 				sizeof(struct vmbuspipe_hdr)];
 
 		if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
-			vmbus_prep_negotiate_resp(icmsghdrp, NULL,
-				hbeat_txf_buf, UTIL_FW_MAJOR_MINOR,
-				HEARTBEAT_MAJOR_MINOR);
+			vmbus_prep_negotiate_resp(icmsghdrp, negop,
+				hbeat_txf_buf, util_fw_version,
+				hb_srv_version);
 		} else {
 			heartbeat_msg =
 				(struct heartbeat_msg_data *)&hbeat_txf_buf[
@@ -321,6 +339,25 @@ static int util_probe(struct hv_device *dev,
 		goto error;
 
 	hv_set_drvdata(dev, srv);
+	/*
+	 * Based on the host; initialize the framework and
+	 * service version numbers we will negotiate.
+	 */
+	switch (vmbus_proto_version) {
+	case (VERSION_WS2008):
+		util_fw_version = UTIL_WS2K8_FW_VERSION;
+		sd_srv_version = SD_WS2008_VERSION;
+		ts_srv_version = TS_WS2008_VERSION;
+		hb_srv_version = HB_WS2008_VERSION;
+		break;
+
+	default:
+		util_fw_version = UTIL_FW_VERSION;
+		sd_srv_version = SD_VERSION;
+		ts_srv_version = TS_VERSION;
+		hb_srv_version = HB_VERSION;
+	}
+
 	return 0;
 
 error:
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index a3b8b2e2d244..d98503bde7e9 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -30,10 +30,13 @@
 /*
  * Framework version for util services.
  */
+#define UTIL_FW_MINOR  0
+
+#define UTIL_WS2K8_FW_MAJOR  1
+#define UTIL_WS2K8_FW_VERSION     (UTIL_WS2K8_FW_MAJOR << 16 | UTIL_FW_MINOR)
 
 #define UTIL_FW_MAJOR  3
-#define UTIL_FW_MINOR  0
-#define UTIL_FW_MAJOR_MINOR     (UTIL_FW_MAJOR << 16 | UTIL_FW_MINOR)
+#define UTIL_FW_VERSION     (UTIL_FW_MAJOR << 16 | UTIL_FW_MINOR)
 
 
 /*
-- 
cgit v1.2.3-71-gd317


From 083986e8248d978b6c961d3da6beb0c921c68220 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 28 Sep 2013 11:23:59 +0200
Subject: mutex: replace CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX with simple ifdef

Linus suggested to replace

 #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
 #define arch_mutex_cpu_relax() cpu_relax()
 #endif

with just a simple

  #ifndef arch_mutex_cpu_relax
  # define arch_mutex_cpu_relax() cpu_relax()
  #endif

to get rid of CONFIG_HAVE_CPU_RELAX_SIMPLE. So architectures can
simply define arch_mutex_cpu_relax if they want an architecture
specific function instead of having to add a select statement in
their Kconfig in addition.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/Kconfig                      | 3 ---
 arch/s390/Kconfig                 | 1 -
 arch/s390/include/asm/mutex.h     | 2 --
 arch/s390/include/asm/processor.h | 2 ++
 include/linux/mutex.h             | 6 +++---
 5 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 1feb169274fe..af2cc6eabcc7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -286,9 +286,6 @@ config HAVE_PERF_USER_STACK_DUMP
 config HAVE_ARCH_JUMP_LABEL
 	bool
 
-config HAVE_ARCH_MUTEX_CPU_RELAX
-	bool
-
 config HAVE_RCU_TABLE_FREE
 	bool
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index dcc6ac2d8026..d3fa84070b82 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -102,7 +102,6 @@ config S390
 	select GENERIC_TIME_VSYSCALL_OLD
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
 	select HAVE_ARCH_JUMP_LABEL if !MARCH_G5
-	select HAVE_ARCH_MUTEX_CPU_RELAX
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT
diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h
index 688271f5f2e4..458c1f7fbc18 100644
--- a/arch/s390/include/asm/mutex.h
+++ b/arch/s390/include/asm/mutex.h
@@ -7,5 +7,3 @@
  */
 
 #include <asm-generic/mutex-dec.h>
-
-#define arch_mutex_cpu_relax()	barrier()
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 0eb37505cab1..ca7821f07260 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -198,6 +198,8 @@ static inline void cpu_relax(void)
 	barrier();
 }
 
+#define arch_mutex_cpu_relax()  barrier()
+
 static inline void psw_set_key(unsigned int key)
 {
 	asm volatile("spka 0(%0)" : : "d" (key));
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index ccd4260834c5..bab49da8a0f0 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -15,8 +15,8 @@
 #include <linux/spinlock_types.h>
 #include <linux/linkage.h>
 #include <linux/lockdep.h>
-
 #include <linux/atomic.h>
+#include <asm/processor.h>
 
 /*
  * Simple, straightforward mutexes with strict semantics:
@@ -175,8 +175,8 @@ extern void mutex_unlock(struct mutex *lock);
 
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
-#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
-#define arch_mutex_cpu_relax()	cpu_relax()
+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
 #endif
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 60e453a940ac678565b6641d65f8c18541bb9f28 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Mon, 23 Sep 2013 20:59:35 +0800
Subject: USBNET: fix handling padding packet

Commit 638c5115a7949(USBNET: support DMA SG) introduces DMA SG
if the usb host controller is capable of building packet from
discontinuous buffers, but missed handling padding packet when
building DMA SG.

This patch attachs the pre-allocated padding packet at the
end of the sg list, so padding packet can be sent to device
if drivers require that.

Reported-by: David Laight <David.Laight@aculab.com>
Acked-by: Oliver Neukum <oliver@neukum.org>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/usbnet.c   | 27 +++++++++++++++++++++------
 include/linux/usb/usbnet.h |  1 +
 2 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 7b331e613e02..bf94e10a37c8 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1241,7 +1241,9 @@ static int build_dma_sg(const struct sk_buff *skb, struct urb *urb)
 	if (num_sgs == 1)
 		return 0;
 
-	urb->sg = kmalloc(num_sgs * sizeof(struct scatterlist), GFP_ATOMIC);
+	/* reserve one for zero packet */
+	urb->sg = kmalloc((num_sgs + 1) * sizeof(struct scatterlist),
+			  GFP_ATOMIC);
 	if (!urb->sg)
 		return -ENOMEM;
 
@@ -1305,7 +1307,7 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 		if (build_dma_sg(skb, urb) < 0)
 			goto drop;
 	}
-	entry->length = length = urb->transfer_buffer_length;
+	length = urb->transfer_buffer_length;
 
 	/* don't assume the hardware handles USB_ZERO_PACKET
 	 * NOTE:  strictly conforming cdc-ether devices should expect
@@ -1317,15 +1319,18 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 	if (length % dev->maxpacket == 0) {
 		if (!(info->flags & FLAG_SEND_ZLP)) {
 			if (!(info->flags & FLAG_MULTI_PACKET)) {
-				urb->transfer_buffer_length++;
-				if (skb_tailroom(skb)) {
+				length++;
+				if (skb_tailroom(skb) && !urb->num_sgs) {
 					skb->data[skb->len] = 0;
 					__skb_put(skb, 1);
-				}
+				} else if (urb->num_sgs)
+					sg_set_buf(&urb->sg[urb->num_sgs++],
+							dev->padding_pkt, 1);
 			}
 		} else
 			urb->transfer_flags |= URB_ZERO_PACKET;
 	}
+	entry->length = urb->transfer_buffer_length = length;
 
 	spin_lock_irqsave(&dev->txq.lock, flags);
 	retval = usb_autopm_get_interface_async(dev->intf);
@@ -1509,6 +1514,7 @@ void usbnet_disconnect (struct usb_interface *intf)
 
 	usb_kill_urb(dev->interrupt);
 	usb_free_urb(dev->interrupt);
+	kfree(dev->padding_pkt);
 
 	free_netdev(net);
 }
@@ -1679,9 +1685,16 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	/* initialize max rx_qlen and tx_qlen */
 	usbnet_update_max_qlen(dev);
 
+	if (dev->can_dma_sg && !(info->flags & FLAG_SEND_ZLP) &&
+		!(info->flags & FLAG_MULTI_PACKET)) {
+		dev->padding_pkt = kzalloc(1, GFP_KERNEL);
+		if (!dev->padding_pkt)
+			goto out4;
+	}
+
 	status = register_netdev (net);
 	if (status)
-		goto out4;
+		goto out5;
 	netif_info(dev, probe, dev->net,
 		   "register '%s' at usb-%s-%s, %s, %pM\n",
 		   udev->dev.driver->name,
@@ -1699,6 +1712,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 
 	return 0;
 
+out5:
+	kfree(dev->padding_pkt);
 out4:
 	usb_free_urb(dev->interrupt);
 out3:
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 9cb2fe8ca944..e303eef94dd5 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -42,6 +42,7 @@ struct usbnet {
 	struct usb_host_endpoint *status;
 	unsigned		maxpacket;
 	struct timer_list	delay;
+	const char		*padding_pkt;
 
 	/* protocol/interface state */
 	struct net_device	*net;
-- 
cgit v1.2.3-71-gd317


From 117aad1e9e4d97448d1df3f84b08bd65811e6d6a Mon Sep 17 00:00:00 2001
From: Rafael Aquini <aquini@redhat.com>
Date: Mon, 30 Sep 2013 13:45:16 -0700
Subject: mm: avoid reinserting isolated balloon pages into LRU lists

Isolated balloon pages can wrongly end up in LRU lists when
migrate_pages() finishes its round without draining all the isolated
page list.

The same issue can happen when reclaim_clean_pages_from_list() tries to
reclaim pages from an isolated page list, before migration, in the CMA
path.  Such balloon page leak opens a race window against LRU lists
shrinkers that leads us to the following kernel panic:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
  IP: [<ffffffff810c2625>] shrink_page_list+0x24e/0x897
  PGD 3cda2067 PUD 3d713067 PMD 0
  Oops: 0000 [#1] SMP
  CPU: 0 PID: 340 Comm: kswapd0 Not tainted 3.12.0-rc1-22626-g4367597 #87
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  RIP: shrink_page_list+0x24e/0x897
  RSP: 0000:ffff88003da499b8  EFLAGS: 00010286
  RAX: 0000000000000000 RBX: ffff88003e82bd60 RCX: 00000000000657d5
  RDX: 0000000000000000 RSI: 000000000000031f RDI: ffff88003e82bd40
  RBP: ffff88003da49ab0 R08: 0000000000000001 R09: 0000000081121a45
  R10: ffffffff81121a45 R11: ffff88003c4a9a28 R12: ffff88003e82bd40
  R13: ffff88003da0e800 R14: 0000000000000001 R15: ffff88003da49d58
  FS:  0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000067d9000 CR3: 000000003ace5000 CR4: 00000000000407b0
  Call Trace:
    shrink_inactive_list+0x240/0x3de
    shrink_lruvec+0x3e0/0x566
    __shrink_zone+0x94/0x178
    shrink_zone+0x3a/0x82
    balance_pgdat+0x32a/0x4c2
    kswapd+0x2f0/0x372
    kthread+0xa2/0xaa
    ret_from_fork+0x7c/0xb0
  Code: 80 7d 8f 01 48 83 95 68 ff ff ff 00 4c 89 e7 e8 5a 7b 00 00 48 85 c0 49 89 c5 75 08 80 7d 8f 00 74 3e eb 31 48 8b 80 18 01 00 00 <48> 8b 74 0d 48 8b 78 30 be 02 00 00 00 ff d2 eb
  RIP  [<ffffffff810c2625>] shrink_page_list+0x24e/0x897
   RSP <ffff88003da499b8>
  CR2: 0000000000000028
  ---[ end trace 703d2451af6ffbfd ]---
  Kernel panic - not syncing: Fatal exception

This patch fixes the issue, by assuring the proper tests are made at
putback_movable_pages() & reclaim_clean_pages_from_list() to avoid
isolated balloon pages being wrongly reinserted in LRU lists.

[akpm@linux-foundation.org: clarify awkward comment text]
Signed-off-by: Rafael Aquini <aquini@redhat.com>
Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/balloon_compaction.h | 25 +++++++++++++++++++++++++
 mm/migrate.c                       |  2 +-
 mm/vmscan.c                        |  4 +++-
 3 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index f7f1d7169b11..089743ade734 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -158,6 +158,26 @@ static inline bool balloon_page_movable(struct page *page)
 	return false;
 }
 
+/*
+ * isolated_balloon_page - identify an isolated balloon page on private
+ *			   compaction/migration page lists.
+ *
+ * After a compaction thread isolates a balloon page for migration, it raises
+ * the page refcount to prevent concurrent compaction threads from re-isolating
+ * the same page. For that reason putback_movable_pages(), or other routines
+ * that need to identify isolated balloon pages on private pagelists, cannot
+ * rely on balloon_page_movable() to accomplish the task.
+ */
+static inline bool isolated_balloon_page(struct page *page)
+{
+	/* Already isolated balloon pages, by default, have a raised refcount */
+	if (page_flags_cleared(page) && !page_mapped(page) &&
+	    page_count(page) >= 2)
+		return __is_movable_balloon_page(page);
+
+	return false;
+}
+
 /*
  * balloon_page_insert - insert a page into the balloon's page list and make
  *		         the page->mapping assignment accordingly.
@@ -243,6 +263,11 @@ static inline bool balloon_page_movable(struct page *page)
 	return false;
 }
 
+static inline bool isolated_balloon_page(struct page *page)
+{
+	return false;
+}
+
 static inline bool balloon_page_isolate(struct page *page)
 {
 	return false;
diff --git a/mm/migrate.c b/mm/migrate.c
index 9c8d5f59d30b..a26bccd44ccb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l)
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (unlikely(balloon_page_movable(page)))
+		if (unlikely(isolated_balloon_page(page)))
 			balloon_page_putback(page);
 		else
 			putback_lru_page(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index beb35778c69f..53f2f82f83ae 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,7 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
+#include <linux/balloon_compaction.h>
 
 #include "internal.h"
 
@@ -1113,7 +1114,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	LIST_HEAD(clean_pages);
 
 	list_for_each_entry_safe(page, next, page_list, lru) {
-		if (page_is_file_cache(page) && !PageDirty(page)) {
+		if (page_is_file_cache(page) && !PageDirty(page) &&
+		    !isolated_balloon_page(page)) {
 			ClearPageActive(page);
 			list_move(&page->lru, &clean_pages);
 		}
-- 
cgit v1.2.3-71-gd317


From 45906723578f768d029ba7774cd97b435f2c5125 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 30 Sep 2013 14:16:41 +0200
Subject: skbuff: size of hole is wrong in a comment

Since commit c93bdd0e03e8 ("netvm: allow skb allocation to use PFMEMALLOC
reserves"), hole size is one bit less than what is written in the comment.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2ddb48d9312c..c2d89335f637 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -498,7 +498,7 @@ struct sk_buff {
 	 * headers if needed
 	 */
 	__u8			encapsulation:1;
-	/* 7/9 bit hole (depending on ndisc_nodetype presence) */
+	/* 6/8 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
 #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL
-- 
cgit v1.2.3-71-gd317